Diffstat (limited to 'sys/kern'): 119 files changed, 84052 insertions, 574 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 0000000..a09e484 --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,19 @@ +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 +# $Id$ + +# Common files for "make tags". +# Included by the Makefile for each architecture. + +# Put the ../sys stuff near the end so that subroutine definitions win when +# there is a struct tag with the same name (eg., vmmeter). The real +# solution would probably be for ctags to generate "struct vmmeter" tags. + +COMM= /sys/conf/*.[ch] \ + /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ + /sys/kern/*.[ch] /sys/libkern/*.[ch] \ + /sys/miscfs/*/*.[ch] \ + /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ + /sys/netiso/*.[ch] /sys/netns/*.[ch] \ + /sys/nfs/*.[ch] /sys/sys/*.[ch] \ + /sys/ufs/*/*.[ch] \ + /sys/vm/*.[ch] diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 0000000..f42a44e --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,53 @@ +# @(#)Makefile 8.2 (Berkeley) 3/21/94 + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \ +../sys/sysproto.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE}/tags. + +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + hp hp/dev hp/hpux \ + kern libkern \ + miscfs miscfs/deadfs miscfs/fdesc miscfs/fifofs miscfs/kernfs \ + miscfs/lofs miscfs/nullfs miscfs/portal miscfs/procfs \ + miscfs/specfs miscfs/umapfs miscfs/union \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/mfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m new file mode 100644 index 0000000..fd4f648 --- /dev/null +++ b/sys/kern/bus_if.m @@ -0,0 +1,141 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $Id: bus_if.m,v 1.4 1998/11/08 18:51:38 nsouch Exp $ +# + +INTERFACE bus; + +# +# This is called from system code which prints out a description of a +# device. It should describe the attachment that the child has with +# the parent. For instance the TurboLaser bus prints which node the +# device is attached to. +# +METHOD void print_child { + device_t dev; + device_t child; +}; + +# +# These two methods manage a bus specific set of instance variables of +# a child device. The intention is that each different type of bus +# defines a set of appropriate instance variables (such as ports and +# irqs for ISA bus etc.) +# +# This information could be given to the child device as a struct but +# that makes it hard for a bus to add or remove variables without +# forcing an edit and recompile for all drivers which may not be +# possible for vendor supplied binary drivers. + +# +# Read an instance variable. Return 0 on success. +# +METHOD int read_ivar { + device_t dev; + device_t child; + int index; + uintptr_t *result; +}; + +# +# Write an instance variable. Return 0 on success. +# +METHOD int write_ivar { + device_t dev; + device_t child; + int index; + uintptr_t value; +}; + +# +# Allocate a system resource attached to `dev' on behalf of `child'. +# The types are defined in <machine/resource.h>; the meaning of the +# resource-ID field varies from bus to bus (but *rid == 0 is always +# valid if the resource type is). start and end reflect the allowable +# range, and should be passed as `0UL' and `~0UL', respectively, if +# the client has no range restriction. count is the number of consecutive +# indices in the resource required. flags is a set of sharing flags +# as defined in <sys/rman.h>. +# +# Returns a resource or a null pointer on failure. The caller is +# responsible for calling rman_activate_resource() when it actually +# uses the resource. +# +METHOD struct resource * alloc_resource { + device_t dev; + device_t child; + int type; + int *rid; + u_long start; + u_long end; + u_long count; + u_int flags; +}; + +METHOD int activate_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *r; +}; + +METHOD int deactivate_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *r; +}; + +# +# Free a resource allocated by the preceding method. The `rid' value +# must be the same as the one returned by BUS_ALLOC_RESOURCE (which +# is not necessarily the same as the one the client passed). 
+# +METHOD int release_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *res; +}; + +METHOD int setup_intr { + device_t dev; + device_t child; + struct resource *irq; + driver_intr_t *intr; + void *arg; + void **cookiep; +}; + +METHOD int teardown_intr { + device_t dev; + device_t child; + struct resource *irq; + void *cookie; +}; diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m new file mode 100644 index 0000000..f429e67 --- /dev/null +++ b/sys/kern/device_if.m @@ -0,0 +1,83 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $Id: device_if.m,v 1.2 1998/11/08 18:35:53 nsouch Exp $ +# + +INTERFACE device; + +# +# Probe to see if the device is present. Return 0 if the device exists, +# ENXIO if it cannot be found. +# +# Devices which implement busses should use this method to probe for +# the existence of devices attached to the bus and add them as +# children. If this is combined with the use of bus_generic_attach, +# the child devices will be automatically probed and attached. +# +METHOD int probe { + device_t dev; +}; + +# +# Attach a device to the system. The probe method will have been +# called and will have indicated that the device exists. This routine +# should initialise the hardware and allocate other system resources +# (such as devfs entries). Returns 0 on success. +# +METHOD int attach { + device_t dev; +}; + +# +# Detach a device. This can be called if the user is replacing the +# driver software or if a device is about to be physically removed +# from the system (e.g. for pccard devices). Returns 0 on success. +# +METHOD int detach { + device_t dev; +}; + +# +# This is called during system shutdown to allow the driver to put the +# hardware into a consistent state for rebooting the computer. +# +METHOD int shutdown { + device_t dev; +}; + +# +# This is called by the power-management subsystem when a suspend has been +# requested by the user or by some automatic mechanism. This gives +# drivers a chance to veto the suspend or save their configuration before +# power is removed. 
+# +METHOD int suspend { + device_t dev; +}; + +METHOD int resume { + device_t dev; +}; diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..9fbd203 --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: imgact_aout.c,v 1.43 1998/10/16 03:55:00 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/systm.h> +#include <machine/md_var.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <sys/user.h> + +static int exec_aout_imgact __P((struct image_params *imgp)); + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out", + aout_coredump +}; + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace; + struct vnode *vp; + vm_object_t object; + vm_offset_t text_end, data_end; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. 
+ * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + if (/* text can't exceed maximum text size */ + a_out->a_text > MAXTSIZ || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * The vm space can be changed by exec_new_vmspace + */ + vmspace = imgp->proc->p_vmspace; + + vp = imgp->vp; + object = vp->v_object; + vm_object_reference(object); + + text_end = virtual_offset + a_out->a_text; + error = vm_map_insert(&vmspace->vm_map, object, + file_offset, + virtual_offset, text_end, + VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + if (error) + return (error); + + data_end = text_end + a_out->a_data; + if (a_out->a_data) { + vm_object_reference(object); + error = vm_map_insert(&vmspace->vm_map, object, + file_offset + a_out->a_text, + text_end, data_end, + VM_PROT_ALL, VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + if (error) + return (error); + } + + pmap_object_init_pt(&vmspace->vm_pmap, virtual_offset, + object, (vm_pindex_t) OFF_TO_IDX(file_offset), + a_out->a_text + a_out->a_data, 0); + + if (bss_size) { + error = vm_map_insert(&vmspace->vm_map, NULL, 0, + data_end, data_end + bss_size, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + } + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (virtual_offset + a_out->a_text); + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Dump core, into a file named as described in the comments for + * expand_name(), unless the process was setuid/setgid. 
+ */ +int +aout_coredump(p) + register struct proc *p; +{ + register struct vnode *vp; + register struct ucred *cred = p->p_cred->pc_ucred; + register struct vmspace *vm = p->p_vmspace; + struct nameidata nd; + struct vattr vattr; + int error, error1; + char *name; /* name of corefile */ + + STOPEVENT(p, S_CORE, 0); + if (sugid_coredump == 0 && p->p_flag & P_SUGID) + return (EFAULT); + if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >= + p->p_rlimit[RLIMIT_CORE].rlim_cur) + return (EFAULT); + name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EFAULT); /* XXX -- not the best error */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); + error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + vp = nd.ni_vp; + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { + error = EFAULT; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, p); + p->p_acflag |= ACORE; + bcopy(p, &p->p_addr->u_kproc.kp_proc, sizeof(struct proc)); + fill_eproc(p, &p->p_addr->u_kproc.kp_eproc); + error = cpu_coredump(p, vp, cred); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, + (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), + round_page(ctob(vm->vm_ssize)), + (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); +out: + VOP_UNLOCK(vp, 0, p); + error1 = vn_close(vp, FWRITE, cred, p); + if (error == 0) + error = error1; + return (error); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +EXEC_SET(aout, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..a0a2284 --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,992 @@ +/*- + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software withough specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: imgact_elf.c,v 1.43 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_prot.h> +#include <vm/vm_extern.h> + +#include <machine/elf.h> +#include <machine/md_var.h> + +__ElfType(Brandinfo); +__ElfType(Auxargs); + +static int elf_check_header __P((const Elf_Ehdr *hdr, int type)); +static int elf_freebsd_fixup __P((long **stack_base, + struct image_params *imgp)); +static int elf_load_file __P((struct proc *p, char *file, u_long *addr, + u_long *entry)); +static int elf_load_section __P((struct proc *p, + struct vmspace *vmspace, struct vnode *vp, + vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, + vm_prot_t prot)); +static int exec_elf_imgact __P((struct image_params *imgp)); + +static int elf_trace = 0; +SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); + +static struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF", + elf_coredump +}; + +static Elf_Brandinfo freebsd_brand_info = { + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +static int +elf_check_header(const Elf_Ehdr *hdr, int type) +{ + if (!IS_ELF(*hdr) || + hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || + hdr->e_ident[EI_DATA] != ELF_TARG_DATA || + hdr->e_ident[EI_VERSION] != EV_CURRENT) + return ENOEXEC; + + if (!ELF_MACHINE_OK(hdr->e_machine)) + return ENOEXEC; + + if (hdr->e_type != type || hdr->e_version != ELF_TARG_VER) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t 
map_len; + vm_offset_t map_addr; + int error, rv; + size_t copy_len; + vm_object_t object; + vm_offset_t file_addr; + vm_offset_t data_buf = 0; + + object = vp->v_object; + error = 0; + + map_addr = trunc_page((vm_offset_t)vmaddr); + file_addr = trunc_page(offset); + + /* + * We have two choices. We can either clear the data in the last page + * of an oversized mapping, or we can start the anon mapping a page + * early and copy the initialized data into that first page. We + * choose the second.. + */ + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - file_addr; + else + map_len = round_page(offset+filsz) - file_addr; + + if (map_len != 0) { + vm_object_reference(object); + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, + object, + file_addr, /* file offset */ + map_addr, /* virtual start */ + map_addr + map_len,/* virtual end */ + prot, + VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) + return EINVAL; + + /* prefault the page tables */ + pmap_object_init_pt(&vmspace->vm_pmap, + map_addr, + object, + (vm_pindex_t) OFF_TO_IDX(file_addr), + map_len, + 0); + + /* we can stop now if we've covered it all */ + if (memsz == filsz) + return 0; + } + + + /* + * We have to get the remaining bit of the file into the first part + * of the oversized map segment. This is normally because the .data + * segment in the file is extended to provide bss. It's a neat idea + * to try and save a page, but it's a pain in the behind to implement. + */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page((vm_offset_t)vmaddr + filsz); + map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; + + /* This had damn well better be true! */ + if (map_len != 0) { + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, NULL, 0, + map_addr, map_addr + map_len, + VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) + return EINVAL; + } + + if (copy_len != 0) { + vm_object_reference(object); + rv = vm_map_find(exec_map, + object, + trunc_page(offset + filsz), + &data_buf, + PAGE_SIZE, + TRUE, + VM_PROT_READ, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_COPY_NEEDED); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + pmap_object_init_pt(exec_map->pmap, data_buf, object, + (vm_pindex_t) OFF_TO_IDX(trunc_page(offset + filsz)), + PAGE_SIZE, 1); + + /* send the page fragment to user space */ + error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len); + vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + if (error) + return (error); + } + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + return error; +} + +static int +elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry) +{ + Elf_Ehdr *hdr = NULL; + Elf_Phdr *phdr = NULL; + struct nameidata nd; + struct vmspace *vmspace = p->p_vmspace; + struct vattr attr; + struct image_params image_params, *imgp; + vm_prot_t prot; + unsigned long text_size = 0, data_size = 0; + unsigned long text_addr = 0, data_addr = 0; + int error, i; + + imgp = &image_params; + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = NULL; + imgp->attr = &attr; + imgp->firstpage = NULL; + imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE); + + if (imgp->image_header == NULL) { + nd.ni_vp = NULL; + error = ENOMEM; + goto fail; + } + + NDINIT(&nd, 
LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p); + + if (error = namei(&nd)) { + nd.ni_vp = NULL; + goto fail; + } + + imgp->vp = nd.ni_vp; + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(nd.ni_vp, 0, p); + goto fail; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (error) + goto fail; + + hdr = (Elf_Ehdr *)imgp->image_header; + if (error = elf_check_header(hdr, ET_DYN)) + goto fail; + + /* Only support headers that fit within first page for now */ + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + + phdr = (Elf_Phdr *)(imgp->image_header + hdr->e_phoff); + + for (i = 0; i < hdr->e_phnum; i++) { + if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(p, vmspace, nd.ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + (*addr), + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? + * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + *entry=(unsigned long)hdr->e_entry+(*addr); + } else { + data_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + } + } + } + +fail: + if (imgp->firstpage) + exec_unmap_first_page(imgp); + if (imgp->image_header) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header, + PAGE_SIZE); + if (nd.ni_vp) + vrele(nd.ni_vp); + + return error; +} + +static int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header; + const Elf_Phdr *phdr; + Elf_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace; + vm_prot_t prot; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i; + const char *interp = NULL; + Elf_Brandinfo *brand_info; + char *brand; + char path[MAXPATHLEN]; + + /* + * Do we have a valid ELF header ? + */ + if (elf_check_header(hdr, ET_EXEC)) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + /* Only support headers in first page for now */ + return ENOEXEC; + } + phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff); + + /* + * From this point on, we may have resources that need to be freed. 
+ */ + if (error = exec_extract_strings(imgp)) + goto fail; + + exec_new_vmspace(imgp); + + vmspace = imgp->proc->p_vmspace; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_LOAD: /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(imgp->proc, + vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? + * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + } + break; + case PT_INTERP: /* Path to interpreter */ + if (phdr[i].p_filesz > MAXPATHLEN || + phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + interp = imgp->image_header + phdr[i].p_offset; + break; + case PT_PHDR: /* Program header table info */ + proghdr = phdr[i].p_vaddr; + break; + default: + break; + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; + + addr = 2L*MAXDSIZ; /* May depend on OS type XXX */ + + imgp->entry_addr = entry; + + /* If the executable has a brand, search for it in the brand list. */ + brand_info = NULL; + brand = (char *)&hdr->e_ident[EI_BRAND]; + if (brand[0] != '\0') { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && strcmp(brand, bi->brand) == 0) { + brand_info = bi; + break; + } + } + } + + /* Lacking a known brand, search for a recognized interpreter. */ + if (brand_info == NULL && interp != NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + strcmp(interp, bi->interp_path) == 0) { + brand_info = bi; + break; + } + } + } + +#ifdef __alpha__ + /* XXX - Assume FreeBSD on the alpha. */ + if (brand_info == NULL) + brand_info = &freebsd_brand_info; +#endif + + if (brand_info == NULL) { + if (brand[0] == 0) + uprintf("ELF binary type not known." 
+ " Use \"brandelf\" to brand it.\n"); + else + uprintf("ELF binary type \"%.*s\" not known.\n", + EI_NIDENT - EI_BRAND, brand); + error = ENOEXEC; + goto fail; + } + + imgp->proc->p_sysent = brand_info->sysvec; + if (interp != NULL) { + snprintf(path, sizeof(path), "%s%s", + brand_info->emul_path, interp); + if ((error = elf_load_file(imgp->proc, path, &addr, + &imgp->entry_addr)) != 0) { + uprintf("ELF interpreter %s not found\n", path); + goto fail; + } + } + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + + /* don't allow modifying the file while we run it */ + imgp->vp->v_flag |= VTEXT; + +fail: + return error; +} + +static int +elf_freebsd_fixup(long **stack_base, struct image_params *imgp) +{ + Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; + long *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + suword(*stack_base, (long) imgp->argc); + return 0; +} + +/* + * Code for generating ELF core dumps. + */ + +typedef void (*segment_callback) __P((vm_map_entry_t, void *)); + +/* Closure for cb_put_phdr(). */ +struct phdr_closure { + Elf_Phdr *phdr; /* Program header to fill in */ + Elf_Off offset; /* Offset of segment in core file */ +}; + +/* Closure for cb_size_segment(). */ +struct sseg_closure { + int count; /* Count of writable segments. */ + size_t size; /* Total size of all writable segments. */ +}; + +static void cb_put_phdr __P((vm_map_entry_t, void *)); +static void cb_size_segment __P((vm_map_entry_t, void *)); +static void each_writable_segment __P((struct proc *, segment_callback, + void *)); +static int elf_corehdr __P((struct proc *, struct vnode *, struct ucred *, + int, void *, size_t)); +static void elf_puthdr __P((struct proc *, void *, size_t *, + const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int)); +static void elf_putnote __P((void *, size_t *, const char *, int, + const void *, size_t)); + +extern int osreldate; + +int +elf_coredump(p) + register struct proc *p; +{ + register struct vnode *vp; + register struct ucred *cred = p->p_cred->pc_ucred; + struct nameidata nd; + struct vattr vattr; + int error, error1; + char *name; /* name of corefile */ + struct sseg_closure seginfo; + void *hdr; + size_t hdrsize; + + STOPEVENT(p, S_CORE, 0); + + if (sugid_coredump == 0 && p->p_flag & P_SUGID) + return (EFAULT); + + /* Size the program segments. */ + seginfo.count = 0; + seginfo.size = 0; + each_writable_segment(p, cb_size_segment, &seginfo); + + /* + * Calculate the size of the core file header area by making + * a dry run of generating it. 
Nothing is written, but the + * size is calculated. + */ + hdrsize = 0; + elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize, + (const prstatus_t *)NULL, (const prfpregset_t *)NULL, + (const prpsinfo_t *)NULL, seginfo.count); + + if (hdrsize + seginfo.size >= p->p_rlimit[RLIMIT_CORE].rlim_cur) + return (EFAULT); + name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EFAULT); /* XXX -- not the best error */ + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); + error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + vp = nd.ni_vp; + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { + error = EFAULT; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, p); + p->p_acflag |= ACORE; + + + /* + * Allocate memory for building the header, fill it up, + * and write it out. + */ + hdr = malloc(hdrsize, M_TEMP, M_WAITOK); + if (hdr == NULL) { + error = EINVAL; + goto out; + } + error = elf_corehdr(p, vp, cred, seginfo.count, hdr, hdrsize); + + /* Write the contents of all of the writable segments. */ + if (error == 0) { + Elf_Phdr *php; + off_t offset; + int i; + + php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; + offset = hdrsize; + for (i = 0; i < seginfo.count; i++) { + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr, + php->p_filesz, offset, UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p); + if (error != 0) + break; + offset += php->p_filesz; + php++; + } + } + free(hdr, M_TEMP); + +out: + VOP_UNLOCK(vp, 0, p); + error1 = vn_close(vp, FWRITE, cred, p); + if (error == 0) + error = error1; + return (error); +} + +/* + * A callback for each_writable_segment() to write out the segment's + * program header entry. + */ +static void +cb_put_phdr(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct phdr_closure *phc = (struct phdr_closure *)closure; + Elf_Phdr *phdr = phc->phdr; + + phc->offset = round_page(phc->offset); + + phdr->p_type = PT_LOAD; + phdr->p_offset = phc->offset; + phdr->p_vaddr = entry->start; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; + phdr->p_align = PAGE_SIZE; + phdr->p_flags = 0; + if (entry->protection & VM_PROT_READ) + phdr->p_flags |= PF_R; + if (entry->protection & VM_PROT_WRITE) + phdr->p_flags |= PF_W; + if (entry->protection & VM_PROT_EXECUTE) + phdr->p_flags |= PF_X; + + phc->offset += phdr->p_filesz; + phc->phdr++; +} + +/* + * A callback for each_writable_segment() to gather information about + * the number of segments and their total size. + */ +static void +cb_size_segment(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct sseg_closure *ssc = (struct sseg_closure *)closure; + + ssc->count++; + ssc->size += entry->end - entry->start; +} + +/* + * For each writable segment in the process's memory map, call the given + * function with a pointer to the map entry and some arbitrary + * caller-supplied data. 
+ */ +static void +each_writable_segment(p, func, closure) + struct proc *p; + segment_callback func; + void *closure; +{ + vm_map_t map = &p->p_vmspace->vm_map; + vm_map_entry_t entry; + + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + vm_object_t obj; + + if (entry->eflags & (MAP_ENTRY_IS_A_MAP|MAP_ENTRY_IS_SUB_MAP) || + (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) != + (VM_PROT_READ|VM_PROT_WRITE)) + continue; + + if ((obj = entry->object.vm_object) == NULL) + continue; + + /* Find the deepest backing object. */ + while (obj->backing_object != NULL) + obj = obj->backing_object; + + /* Ignore memory-mapped devices and such things. */ + if (obj->type != OBJT_DEFAULT && + obj->type != OBJT_SWAP && + obj->type != OBJT_VNODE) + continue; + + (*func)(entry, closure); + } +} + +/* + * Write the core file header to the file, including padding up to + * the page boundary. + */ +static int +elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize) + struct proc *p; + struct vnode *vp; + struct ucred *cred; + int numsegs; + size_t hdrsize; + void *hdr; +{ + size_t off; + prstatus_t status; + prfpregset_t fpregset; + prpsinfo_t psinfo; + + /* Gather the information for the header. */ + bzero(&status, sizeof status); + status.pr_version = PRSTATUS_VERSION; + status.pr_statussz = sizeof(prstatus_t); + status.pr_gregsetsz = sizeof(gregset_t); + status.pr_fpregsetsz = sizeof(fpregset_t); + status.pr_osreldate = osreldate; +#ifndef COMPAT_LINUX_THREADS + status.pr_cursig = p->p_sigacts->ps_sig; +#else + status.pr_cursig = p->p_sig; +#endif /* COMPAT_LINUX_THREADS */ + status.pr_pid = p->p_pid; + fill_regs(p, &status.pr_reg); + + fill_fpregs(p, &fpregset); + + bzero(&psinfo, sizeof psinfo); + psinfo.pr_version = PRPSINFO_VERSION; + psinfo.pr_psinfosz = sizeof(prpsinfo_t); + strncpy(psinfo.pr_fname, p->p_comm, MAXCOMLEN); + /* XXX - We don't fill in the command line arguments properly yet. */ + strncpy(psinfo.pr_psargs, p->p_comm, PRARGSZ); + + /* Fill in the header. */ + bzero(hdr, hdrsize); + off = 0; + elf_puthdr(p, hdr, &off, &status, &fpregset, &psinfo, numsegs); + + /* Write it to the core file. */ + return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p); +} + +static void +elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status, + const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs) +{ + size_t ehoff; + size_t phoff; + size_t noteoff; + size_t notesz; + + ehoff = *off; + *off += sizeof(Elf_Ehdr); + + phoff = *off; + *off += (numsegs + 1) * sizeof(Elf_Phdr); + + noteoff = *off; + elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status, + sizeof *status); + elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset, + sizeof *fpregset); + elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, + sizeof *psinfo); + notesz = *off - noteoff; + + /* Align up to a page boundary for the program segments. */ + *off = round_page(*off); + + if (dst != NULL) { + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + struct phdr_closure phc; + + /* + * Fill in the ELF header. 
+ */ + ehdr = (Elf_Ehdr *)((char *)dst + ehoff); + ehdr->e_ident[EI_MAG0] = ELFMAG0; + ehdr->e_ident[EI_MAG1] = ELFMAG1; + ehdr->e_ident[EI_MAG2] = ELFMAG2; + ehdr->e_ident[EI_MAG3] = ELFMAG3; + ehdr->e_ident[EI_CLASS] = ELF_CLASS; + ehdr->e_ident[EI_DATA] = ELF_DATA; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_PAD] = 0; + strncpy(ehdr->e_ident + EI_BRAND, "FreeBSD", + EI_NIDENT - EI_BRAND); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = phoff; + ehdr->e_flags = 0; + ehdr->e_ehsize = sizeof(Elf_Ehdr); + ehdr->e_phentsize = sizeof(Elf_Phdr); + ehdr->e_phnum = numsegs + 1; + ehdr->e_shentsize = sizeof(Elf_Shdr); + ehdr->e_shnum = 0; + ehdr->e_shstrndx = SHN_UNDEF; + + /* + * Fill in the program header entries. + */ + phdr = (Elf_Phdr *)((char *)dst + phoff); + + /* The note segement. */ + phdr->p_type = PT_NOTE; + phdr->p_offset = noteoff; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = notesz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + phdr++; + + /* All the writable segments from the program. */ + phc.phdr = phdr; + phc.offset = *off; + each_writable_segment(p, cb_put_phdr, &phc); + } +} + +static void +elf_putnote(void *dst, size_t *off, const char *name, int type, + const void *desc, size_t descsz) +{ + Elf_Note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = descsz; + note.n_type = type; + if (dst != NULL) + bcopy(¬e, (char *)dst + *off, sizeof note); + *off += sizeof note; + if (dst != NULL) + bcopy(name, (char *)dst + *off, note.n_namesz); + *off += roundup2(note.n_namesz, sizeof(Elf_Size)); + if (dst != NULL) + bcopy(desc, (char *)dst + *off, note.n_descsz); + *off += roundup2(note.n_descsz, sizeof(Elf_Size)); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +EXEC_SET(elf, elf_execsw); diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..d666a87 --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,378 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id: imgact_gzip.c,v 1.34 1998/07/15 05:00:26 bde Exp $ + * + * This module handles execution of a.out files which have been run through + * "gzip". This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... 
+ * tidy up unnecesary includes + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact __P((struct image_params *imgp)); +static int NextByte __P((void *vp)); +static int do_aout_hdr __P((struct imgact_gzip *)); +static int Flush __P((void *vp, u_char *, u_long siz)); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > MAXTSIZ || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmspace = gz->ip->proc->p_vmspace; + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, + VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. 
+ */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (gz->virtual_offset + gz->a_out.a_text); + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset == 0) { + q = (u_char *) (uintptr_t) gz->virtual_offset; + copyout(&gz->a_out, q, sizeof gz->a_out); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && + gz->output + siz > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) (uintptr_t) + (gz->virtual_offset + gz->output - gz->file_offset); + copyout(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ + +static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +EXEC_SET(execgzip, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..e72b86d --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: imgact_shell.c,v 1.16 1997/08/02 14:31:23 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +#define MAXSHELLCMDLEN 64 + +static int exec_shell_imgact __P((struct image_params *imgp)); + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +static int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? */ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENOEXEC); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. 
+ */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + imgp->argv0 = imgp->uap->fname; + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw shell_execsw = { exec_shell_imgact, "#!" }; +EXEC_SET(shell, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..1db9b2c --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1078 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id: inflate.c,v 1.11 1997/10/12 20:23:40 phk Exp $ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef KERNEL +#include <sys/systm.h> +#include <sys/kernel.h> +#endif +#include <sys/malloc.h> + +#ifdef KERNEL +static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees"); +#endif + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef KERNEL /* want to use this file in kzip also */ +extern unsigned char *kzipmalloc (int); +extern void kzipfree (void*); +#define malloc(x, y, z) kzipmalloc((x)) +#define free(x, y) kzipfree((x)) +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. + + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. 
Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. 
The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+ The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+ defined for them. Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type are really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6).
Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *)); +static int huft_free __P((struct inflate *, struct huft *)); +static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int)); +static int inflate_stored __P((struct inflate *)); +static int xinflate __P((struct inflate *)); +static int inflate_fixed __P((struct inflate *)); +static int inflate_dynamic __P((struct inflate *)); +static int inflate_block __P((struct inflate *, int *)); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. 
The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. */ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. */ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. 
These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. */ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g - w) > (unsigned) *m ? 
*m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. 
*/ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. */ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + 
return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? */ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 0000000..246684f --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,698 @@ +/* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ * $Id: init_main.c,v 1.102 1998/12/30 10:38:58 dfr Exp $
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+
+#include <machine/cpu.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <sys/copyright.h>
+
+extern struct linker_set sysinit_set; /* XXX */
+
+extern void __main __P((void));
+extern void main __P((void *framep));
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+static struct pcred cred0;
+#ifdef COMPAT_LINUX_THREADS
+static struct procsig procsig0;
+#endif /* COMPAT_LINUX_THREADS */
+static struct filedesc0 filedesc0;
+static struct plimit limit0;
+static struct vmspace vmspace0;
+struct proc *initproc;
+
+int cmask = CMASK;
+extern struct user *proc0paddr;
+
+struct vnode *rootvp;
+int boothowto = 0; /* initialized so that it can be patched */
+
+struct timeval boottime;
+SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime,
+ CTLFLAG_RD, &boottime, timeval, "");
+
+static int shutdowntimeout = 120;
+SYSCTL_INT(_kern, OID_AUTO, shutdown_timeout,
+ CTLFLAG_RW, &shutdowntimeout, 0, "");
+
+/*
+ * Promiscuous argument pass for start_init()
+ *
+ * This is a kludge because we use a return from main() rather than a call
+ * to a new routine in locore.s to kick the kernel alive from locore.s.
+ */
+static void *init_framep;
+
+
+#if __GNUC__ >= 2
+void __main() {}
+#endif
+
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
+ * executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL)
+
+/*
+ * The sysinit table itself. Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit. + */ +struct sysinit **sysinit = (struct sysinit **)sysinit_set.ls_items; +struct sysinit **newsysinit; + +/* + * Merge a new sysinit set into the current set, reallocating it if + * necessary. This can only be called after malloc is running. + */ +void +sysinit_add(set) + struct sysinit **set; +{ + struct sysinit **newset; + struct sysinit **sipp; + struct sysinit **xipp; + int count = 0; + + if (newsysinit) + for (sipp = newsysinit; *sipp; sipp++) + count++; + else + for (sipp = sysinit; *sipp; sipp++) + count++; + for (sipp = set; *sipp; sipp++) + count++; + count++; /* Trailing NULL */ + newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); + if (newset == NULL) + panic("cannot malloc for sysinit"); + xipp = newset; + if (newsysinit) + for (sipp = newsysinit; *sipp; sipp++) + *xipp++ = *sipp; + else + for (sipp = sysinit; *sipp; sipp++) + *xipp++ = *sipp; + for (sipp = set; *sipp; sipp++) + *xipp++ = *sipp; + *xipp = NULL; + if (newsysinit) + free(newsysinit, M_TEMP); + newsysinit = newset; +} + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads". + */ +void +main(framep) + void *framep; +{ + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + + /* + * Copy the locore.s frame pointer for proc0, this is forked into + * all other processes. + */ + init_framep = framep; + +restart: + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + */ + for (sipp = sysinit; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order < (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. 
+ */ + for (sipp = sysinit; *sipp; sipp++) { + + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + if ((*sipp)->subsystem == SI_SUB_DONE) + continue; + + switch( (*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: +#if !defined(SMP) + /* kernel thread*/ + if (fork1(&proc0, RFMEM|RFFDG|RFPROC)) + panic("fork kernel thread"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; +#endif + + case SI_TYPE_KPROCESS: + if (fork1(&proc0, RFFDG|RFPROC)) + panic("fork kernel process"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; + + default: + panic("init_main: unrecognized init type"); + } + + /* Check off the one we're just done */ + (*sipp)->subsystem = SI_SUB_DONE; + + /* Check if we've installed more sysinit items via KLD */ + if (newsysinit != NULL) { + if (sysinit != (struct sysinit **)sysinit_set.ls_items) + free(sysinit, M_TEMP); + sysinit = newsysinit; + newsysinit = NULL; + goto restart; + } + } + + panic("Shouldn't get here!"); + /* NOTREACHED*/ +} + + +/* + * Start a kernel process. This is called after a fork() call in + * main() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons. + */ +/* ARGSUSED*/ +void +kproc_start(udata) + void *udata; +{ + struct kproc_desc *kp = udata; + struct proc *p = curproc; + +#ifdef DIAGNOSTIC + printf("Start pid=%d <%s>\n",p->p_pid, kp->arg0); +#endif + + /* save a global descriptor, if desired*/ + if( kp->global_procpp != NULL) + *kp->global_procpp = p; + + /* this is a non-swapped system process*/ + p->p_flag |= P_INMEM | P_SYSTEM; + + /* set up arg0 for 'ps', et al*/ + strcpy( p->p_comm, kp->arg0); + + /* call the processes' main()...*/ + (*kp->func)(); + + /* NOTREACHED */ + panic("kproc_start: %s", kp->arg0); +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. + **** + *************************************************************************** + */ +#ifdef OMIT +/* + * Handled by vfs_mountroot (bad idea) at this time... should be + * done the same as 4.4Lite2. + */ +SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) +#endif /* OMIT*/ + +static void print_caddr_t __P((void *data)); +static void +print_caddr_t(data) + void *data; +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) + + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void proc0_init __P((void *dummy)); +static void +proc0_init(dummy) + void *dummy; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. 
+ */ + p = &proc0; + curproc = p; /* XXX redundant*/ + + /* + * Initialize process and pgrp structures. + */ + procinit(); + + /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* + * additional VM structures + */ + vm_init2(); + + /* + * Create process 0 (the swapper). + */ + LIST_INSERT_HEAD(&allproc, p, p_list); + p->p_pgrp = &pgrp0; + LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); + LIST_INIT(&pgrp0.pg_members); + LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); + + pgrp0.pg_session = &session0; + session0.s_count = 1; + session0.s_leader = p; + + p->p_sysent = &aout_sysvec; + + p->p_flag = P_INMEM | P_SYSTEM; + p->p_stat = SRUN; + p->p_nice = NZERO; + p->p_rtprio.type = RTP_PRIO_NORMAL; + p->p_rtprio.prio = 0; + +/* + * Link for kernel based threads + */ + p->p_peers = 0; + p->p_leader = p; + + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + /* Create credentials. */ + cred0.p_refcnt = 1; + p->p_cred = &cred0; + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + +#ifdef COMPAT_LINUX_THREADS + /* Create procsig. */ + p->p_procsig = &procsig0; + p->p_procsig->ps_refcnt = 2; + +#endif /* COMPAT_LINUX_THREADS */ + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. */ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_cpulimit = RLIM_INFINITY; + limit0.p_refcnt = 1; + + + /* Allocate a prototype map so we have something to fork. */ + pmap_pinit0(&vmspace0.vm_pmap); + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS)); + vmspace0.vm_map.pmap = &vmspace0.vm_pmap; + p->p_addr = proc0paddr; /* XXX */ + +#ifndef __alpha__ /* XXX what is this? */ +#define INCOMPAT_LITES2 +#ifdef INCOMPAT_LITES2 + /* + * proc0 needs to have a coherent frame base in its stack. + */ + cpu_set_init_frame(p, init_framep); /* XXX! */ +#endif /* INCOMPAT_LITES2*/ +#endif + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_addr->u_stats; + p->p_sigacts = &p->p_addr->u_sigacts; + + /* + * Charge root for one process. + */ + (void)chgproccnt(0, 1); + + /* + * Initialize the procfs flags (to 0, of course) + */ + p->p_stops = p->p_stype = p->p_step = 0; + +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) + +/* ARGSUSED*/ +static void proc0_post __P((void *dummy)); +static void +proc0_post(dummy) + void *dummy; +{ + struct timespec ts; + + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_runtime as it may have been + * munched in mi_switch() after the time got set. Set + * p->p_switchtime to be consistent with this unmunching. 
+ */ + microtime(&proc0.p_stats->p_start); + proc0.p_runtime = 0; + microuptime(&proc0.p_switchtime); + + /* + * Give the ``random'' number generator a thump. + * XXX: Does read_random() contain enough bits to be used here ? + */ + nanotime(&ts); + srandom(ts.tv_sec ^ ts.tv_nsec); + + /* Initialize signal state for process 0. */ + siginit(&proc0); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) + + + + +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ + +/* ARGSUSED */ +static void root_conf __P((void *dummy)); +static void +root_conf(dummy) + void *dummy; +{ + cpu_rootconf(); +} +SYSINIT(root_conf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, root_conf, NULL) + +/* ARGSUSED*/ +static void xxx_vfs_root_fdtab __P((void *dummy)); +static void +xxx_vfs_root_fdtab(dummy) + void *dummy; +{ + register struct filedesc0 *fdp = &filedesc0; + + /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ + if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) + panic("cannot find root vnode"); + fdp->fd_fd.fd_cdir = rootvnode; + VREF(fdp->fd_fd.fd_cdir); + VOP_UNLOCK(rootvnode, 0, &proc0); + fdp->fd_fd.fd_rdir = rootvnode; +} +SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) + + +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. It is here for two reasons only: + **** + **** 1) This code returns to startup the system; this is + **** abnormal for a kernel thread. + **** 2) This code promiscuously uses init_frame + **** + *************************************************************************** + */ + +static void kthread_init __P((void *dummy)); +SYSINIT_KP(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) + + +extern void prepare_usermode __P((void)); +static void start_init __P((struct proc *p)); + +/* ARGSUSED*/ +static void +kthread_init(dummy) + void *dummy; +{ + /* Create process 1 (init(8)). */ + start_init(curproc); + + prepare_usermode(); + + /* + * This returns to the fork trampoline, then to user mode. + */ + return; +} + + +/* + * List of paths to try when searching for "init". + */ +static char *initpaths[] = { + "/sbin/init", + "/sbin/oinit", + "/sbin/init.bak", + "/stand/sysinstall", + NULL, +}; + +/* + * Start the initial user process; try exec'ing each pathname in "initpaths". + * The program is invoked with one argument containing the boot flags. + */ +static void +start_init(p) + struct proc *p; +{ + vm_offset_t addr; + struct execve_args args; + int options, i, error; + char **pathp, *path, *ucp, **uap, *arg0, *arg1; + + initproc = p; + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(USRSTACK - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; + + for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { + /* + * Move out the boot flag argument. 
+ */ + options = 0; + ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ + if (boothowto & RB_SINGLE) { + (void)subyte(--ucp, 's'); + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + (void)subyte(--ucp, 'f'); + options = 1; + } +#endif + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; + + /* + * Move out the file name (also arg 0). + */ + for (i = strlen(path) + 1; i >= 0; i--) + (void)subyte(--ucp, path[i]); + arg0 = ucp; + + /* + * Move out the arg pointers. + */ + uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); + (void)suword((caddr_t)--uap, (long)0); /* terminator */ + (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); + (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); + + /* + * Point at the arguments. + */ + args.fname = arg0; + args.argv = uap; + args.envv = NULL; + + /* + * Now try to exec the program. If can't for any reason + * other than it doesn't exist, complain. + * + * Otherwise return to main() which returns to btext + * which completes the system startup. + */ + if ((error = execve(p, &args)) == 0) + return; + if (error != ENOENT) + printf("exec %s: error %d\n", path, error); + } + printf("init: not found\n"); + panic("no init"); +} diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 0000000..c31ed46 --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,360 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +#ifdef COMPAT_43 +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) +#else +#define compat(n, name) 0, (sy_call_t *)nosys +#endif + +/* The casts are bogus but will do for now. 
*/ +struct sysent sysent[] = { + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { 1, (sy_call_t *)exit }, /* 1 = exit */ + { 0, (sy_call_t *)fork }, /* 2 = fork */ + { 3, (sy_call_t *)read }, /* 3 = read */ + { 3, (sy_call_t *)write }, /* 4 = write */ + { 3, (sy_call_t *)open }, /* 5 = open */ + { 1, (sy_call_t *)close }, /* 6 = close */ + { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(2,creat) }, /* 8 = old creat */ + { 2, (sy_call_t *)link }, /* 9 = link */ + { 1, (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { 1, (sy_call_t *)chdir }, /* 12 = chdir */ + { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */ + { 3, (sy_call_t *)mknod }, /* 14 = mknod */ + { 2, (sy_call_t *)chmod }, /* 15 = chmod */ + { 3, (sy_call_t *)chown }, /* 16 = chown */ + { 1, (sy_call_t *)obreak }, /* 17 = break */ + { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(3,lseek) }, /* 19 = old lseek */ + { 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { 4, (sy_call_t *)mount }, /* 21 = mount */ + { 2, (sy_call_t *)unmount }, /* 22 = unmount */ + { 1, (sy_call_t *)setuid }, /* 23 = setuid */ + { 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */ + { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { 3, (sy_call_t *)accept }, /* 30 = accept */ + { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */ + { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */ + { 2, (sy_call_t *)access }, /* 33 = access */ + { 2, (sy_call_t *)chflags }, /* 34 = chflags */ + { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { 2, (sy_call_t *)kill }, /* 37 = kill */ + { compat(2,stat) }, /* 38 = old stat */ + { 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(2,lstat) }, /* 40 = old lstat */ + { 1, (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { 4, (sy_call_t *)profil }, /* 44 = profil */ + { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */ + { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */ + { 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */ + { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */ + { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */ + { 1, (sy_call_t *)acct }, /* 51 = acct */ + { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */ + { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */ + { 1, (sy_call_t *)reboot }, /* 55 = reboot */ + { 1, (sy_call_t *)revoke }, /* 56 = revoke */ + { 2, (sy_call_t *)symlink }, /* 57 = symlink */ + { 3, (sy_call_t *)readlink }, /* 58 = readlink */ + { 3, (sy_call_t *)execve }, /* 59 = execve */ + { 1, (sy_call_t *)umask }, /* 60 = umask */ + { 1, (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(2,fstat) }, /* 62 = old fstat */ + { compat(4,getkerninfo) }, /* 63 = old getkerninfo */ + { compat(0,getpagesize) }, /* 64 = old getpagesize */ + { 3, (sy_call_t *)msync }, /* 65 = msync */ + { 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */ + { 1, (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(6,mmap) }, /* 71 = old mmap */ + { 1, (sy_call_t *)ovadvise }, /* 72 = 
vadvise */ + { 2, (sy_call_t *)munmap }, /* 73 = munmap */ + { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */ + { 3, (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { 3, (sy_call_t *)mincore }, /* 78 = mincore */ + { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */ + { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */ + { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */ + { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(0,wait) }, /* 84 = old wait */ + { 1, (sy_call_t *)swapon }, /* 85 = swapon */ + { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(2,gethostname) }, /* 87 = old gethostname */ + { compat(2,sethostname) }, /* 88 = old sethostname */ + { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */ + { 5, (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { 1, (sy_call_t *)fsync }, /* 95 = fsync */ + { 3, (sy_call_t *)setpriority }, /* 96 = setpriority */ + { 3, (sy_call_t *)socket }, /* 97 = socket */ + { 3, (sy_call_t *)connect }, /* 98 = connect */ + { compat(3,accept) }, /* 99 = old accept */ + { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(4,send) }, /* 101 = old send */ + { compat(4,recv) }, /* 102 = old recv */ + { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */ + { 3, (sy_call_t *)bind }, /* 104 = bind */ + { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { 2, (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(3,sigvec) }, /* 108 = old sigvec */ + { compat(1,sigblock) }, /* 109 = old sigblock */ + { compat(1,sigsetmask) }, /* 110 = old sigsetmask */ + { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */ + { compat(2,sigstack) }, /* 112 = old sigstack */ + { compat(3,recvmsg) }, /* 113 = old recvmsg */ + { compat(3,sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */ + { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { 3, (sy_call_t *)readv }, /* 120 = readv */ + { 3, (sy_call_t *)writev }, /* 121 = writev */ + { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { 3, (sy_call_t *)fchown }, /* 123 = fchown */ + { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(6,recvfrom) }, /* 125 = old recvfrom */ + { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */ + { 2, (sy_call_t *)setregid }, /* 127 = setregid */ + { 2, (sy_call_t *)rename }, /* 128 = rename */ + { compat(2,truncate) }, /* 129 = old truncate */ + { compat(2,ftruncate) }, /* 130 = old ftruncate */ + { 2, (sy_call_t *)flock }, /* 131 = flock */ + { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { 6, (sy_call_t *)sendto }, /* 133 = sendto */ + { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */ + { 4, (sy_call_t *)socketpair }, /* 135 = socketpair */ + { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */ + { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */ + { 2, (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(3,getpeername) }, /* 141 = old 
getpeername */ + { compat(0,gethostid) }, /* 142 = old gethostid */ + { compat(1,sethostid) }, /* 143 = old sethostid */ + { compat(2,getrlimit) }, /* 144 = old getrlimit */ + { compat(2,setrlimit) }, /* 145 = old setrlimit */ + { compat(2,killpg) }, /* 146 = old killpg */ + { 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(0,quota) }, /* 149 = old quota */ + { compat(3,getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ + { 2, (sy_call_t *)nosys }, /* 155 = nfssvc */ + { compat(4,getdirentries) }, /* 156 = old getdirentries */ + { 2, (sy_call_t *)statfs }, /* 157 = statfs */ + { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ + { 2, (sy_call_t *)nosys }, /* 161 = getfh */ + { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { 1, (sy_call_t *)uname }, /* 164 = uname */ + { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */ + { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { 5, (sy_call_t *)semsys }, /* 169 = semsys */ + { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */ + { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { 0, (sy_call_t *)nosys }, /* 173 = nosys */ + { 0, (sy_call_t *)nosys }, /* 174 = nosys */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { 1, (sy_call_t *)setgid }, /* 181 = setgid */ + { 1, (sy_call_t *)setegid }, /* 182 = setegid */ + { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */ + { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */ + { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */ + { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */ + { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */ + { 2, (sy_call_t *)stat }, /* 188 = stat */ + { 2, (sy_call_t *)fstat }, /* 189 = fstat */ + { 2, (sy_call_t *)lstat }, /* 190 = lstat */ + { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */ + { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { 8, (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { 5, (sy_call_t *)lseek }, /* 199 = lseek */ + { 4, (sy_call_t *)truncate }, /* 200 = truncate */ + { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { 2, (sy_call_t *)mlock }, /* 203 = mlock */ + { 2, (sy_call_t *)munlock }, /* 204 = munlock */ + { 1, (sy_call_t *)undelete }, /* 205 = undelete */ + { 2, (sy_call_t *)futimes }, /* 206 = futimes */ + { 1, (sy_call_t *)getpgid }, /* 207 = getpgid */ + { 0, (sy_call_t *)nosys }, /* 208 = newreboot */ + { 3, (sy_call_t *)poll }, /* 209 = poll */ + { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { 0, (sy_call_t 
*)lkmnosys }, /* 211 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */ + { 3, (sy_call_t *)semget }, /* 221 = semget */ + { 3, (sy_call_t *)semop }, /* 222 = semop */ + { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */ + { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */ + { 2, (sy_call_t *)msgget }, /* 225 = msgget */ + { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */ + { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */ + { 3, (sy_call_t *)shmat }, /* 228 = shmat */ + { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */ + { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */ + { 3, (sy_call_t *)shmget }, /* 231 = shmget */ + { 2, (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */ + { 2, (sy_call_t *)clock_settime }, /* 233 = clock_settime */ + { 2, (sy_call_t *)clock_getres }, /* 234 = clock_getres */ + { 0, (sy_call_t *)nosys }, /* 235 = timer_create */ + { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */ + { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */ + { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */ + { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */ + { 2, (sy_call_t *)nanosleep }, /* 240 = nanosleep */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { 3, (sy_call_t *)minherit }, /* 250 = minherit */ + { 1, (sy_call_t *)rfork }, /* 251 = rfork */ + { 3, (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */ + { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */ + { 3, (sy_call_t *)lchown }, /* 254 = lchown */ + { 0, (sy_call_t *)nosys }, /* 255 = nosys */ + { 0, (sy_call_t *)nosys }, /* 256 = nosys */ + { 0, (sy_call_t *)nosys }, /* 257 = nosys */ + { 0, (sy_call_t *)nosys }, /* 258 = nosys */ + { 0, (sy_call_t *)nosys }, /* 259 = nosys */ + { 0, (sy_call_t *)nosys }, /* 260 = nosys */ + { 0, (sy_call_t *)nosys }, /* 261 = nosys */ + { 0, (sy_call_t *)nosys }, /* 262 = nosys */ + { 0, (sy_call_t *)nosys }, /* 263 = nosys */ + { 0, (sy_call_t *)nosys }, /* 264 = nosys */ + { 0, (sy_call_t *)nosys }, /* 265 = nosys */ + { 0, (sy_call_t *)nosys }, /* 266 = nosys */ + { 0, (sy_call_t *)nosys }, /* 267 = nosys */ + { 0, (sy_call_t *)nosys }, /* 268 = nosys */ + { 0, (sy_call_t *)nosys }, /* 269 = nosys */ + { 0, (sy_call_t *)nosys }, /* 270 = nosys */ + { 0, (sy_call_t *)nosys }, /* 271 = nosys */ + { 3, (sy_call_t *)getdents }, /* 272 = getdents */ + { 0, (sy_call_t *)nosys }, /* 273 = nosys */ + { 2, (sy_call_t *)lchmod }, /* 274 = lchmod */ + { 3, (sy_call_t *)lchown }, /* 275 = netbsd_lchown */ + { 2, (sy_call_t *)lutimes }, /* 276 = lutimes */ + { 3, (sy_call_t *)msync }, /* 277 = netbsd_msync */ + { 2, (sy_call_t *)nstat }, /* 278 = nstat */ + { 2, (sy_call_t *)nfstat }, /* 279 = nfstat */ + { 2, (sy_call_t *)nlstat }, /* 280 = nlstat */ + { 0, (sy_call_t *)nosys }, /* 281 = nosys 
*/ + { 0, (sy_call_t *)nosys }, /* 282 = nosys */ + { 0, (sy_call_t *)nosys }, /* 283 = nosys */ + { 0, (sy_call_t *)nosys }, /* 284 = nosys */ + { 0, (sy_call_t *)nosys }, /* 285 = nosys */ + { 0, (sy_call_t *)nosys }, /* 286 = nosys */ + { 0, (sy_call_t *)nosys }, /* 287 = nosys */ + { 0, (sy_call_t *)nosys }, /* 288 = nosys */ + { 0, (sy_call_t *)nosys }, /* 289 = nosys */ + { 0, (sy_call_t *)nosys }, /* 290 = nosys */ + { 0, (sy_call_t *)nosys }, /* 291 = nosys */ + { 0, (sy_call_t *)nosys }, /* 292 = nosys */ + { 0, (sy_call_t *)nosys }, /* 293 = nosys */ + { 0, (sy_call_t *)nosys }, /* 294 = nosys */ + { 0, (sy_call_t *)nosys }, /* 295 = nosys */ + { 0, (sy_call_t *)nosys }, /* 296 = nosys */ + { 0, (sy_call_t *)nosys }, /* 297 = nosys */ + { 0, (sy_call_t *)nosys }, /* 298 = nosys */ + { 0, (sy_call_t *)nosys }, /* 299 = nosys */ + { 1, (sy_call_t *)modnext }, /* 300 = modnext */ + { 2, (sy_call_t *)modstat }, /* 301 = modstat */ + { 1, (sy_call_t *)modfnext }, /* 302 = modfnext */ + { 1, (sy_call_t *)modfind }, /* 303 = modfind */ + { 1, (sy_call_t *)kldload }, /* 304 = kldload */ + { 1, (sy_call_t *)kldunload }, /* 305 = kldunload */ + { 1, (sy_call_t *)kldfind }, /* 306 = kldfind */ + { 1, (sy_call_t *)kldnext }, /* 307 = kldnext */ + { 2, (sy_call_t *)kldstat }, /* 308 = kldstat */ + { 1, (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */ + { 1, (sy_call_t *)getsid }, /* 310 = getsid */ + { 0, (sy_call_t *)nosys }, /* 311 = setresuid */ + { 0, (sy_call_t *)nosys }, /* 312 = setresgid */ + { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */ + { 1, (sy_call_t *)aio_return }, /* 314 = aio_return */ + { 3, (sy_call_t *)aio_suspend }, /* 315 = aio_suspend */ + { 2, (sy_call_t *)aio_cancel }, /* 316 = aio_cancel */ + { 1, (sy_call_t *)aio_error }, /* 317 = aio_error */ + { 1, (sy_call_t *)aio_read }, /* 318 = aio_read */ + { 1, (sy_call_t *)aio_write }, /* 319 = aio_write */ + { 4, (sy_call_t *)lio_listio }, /* 320 = lio_listio */ + { 0, (sy_call_t *)yield }, /* 321 = yield */ + { 1, (sy_call_t *)thr_sleep }, /* 322 = thr_sleep */ + { 1, (sy_call_t *)thr_wakeup }, /* 323 = thr_wakeup */ + { 1, (sy_call_t *)mlockall }, /* 324 = mlockall */ + { 0, (sy_call_t *)munlockall }, /* 325 = munlockall */ + { 2, (sy_call_t *)__getcwd }, /* 326 = __getcwd */ + { 2, (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */ + { 2, (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */ + { 3, (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */ + { 1, (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */ + { 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */ + { 1, (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */ + { 1, (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */ + { 2, (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */ + { 2, (sy_call_t *)utrace }, /* 335 = utrace */ + { 8, (sy_call_t *)sendfile }, /* 336 = sendfile */ + { 3, (sy_call_t *)kldsym }, /* 337 = kldsym */ +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 0000000..11db4e9 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 1994 Christopher G. Demetriou + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $Id: kern_acct.c,v 1.18 1997/11/06 19:29:07 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/syslog.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> + +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. + * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t __P((u_long, u_long)); +static void acctwatch __P((void *)); + +/* + * Accounting callout handle used for periodic scheduling of + * acctwatch. + */ +static struct callout_handle acctwatch_handle + = CALLOUT_HANDLE_INITIALIZER(&acctwatch_handle); + +/* + * Accounting vnode pointer, and saved vnode pointer. 
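For orientation, the switch that these pointers implement is driven from userland through the acct(2) system call defined a little further down; a minimal, hedged usage sketch (the path is only an example, and the call requires root privilege):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Begin appending accounting records to an example file... */
	if (acct("/var/account/acct") == -1)
		perror("acct enable");

	/* ...and a NULL path switches accounting off again. */
	if (acct(NULL) == -1)
		perror("acct disable");
	return (0);
}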
+ */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, ""); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, ""); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, ""); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. + */ +int +acct(a1, uap) + struct proc *a1; + struct acct_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct proc *p = curproc; /* XXX */ + struct nameidata nd; + int error; + + /* Make sure that the caller is root. */ + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + + /* + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. + */ + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + p); + error = vn_open(&nd, FWRITE, 0); + if (error) + return (error); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, p->p_ucred, p); + return (EACCES); + } + } + + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). + */ + if (acctp != NULLVP || savacctp != NULLVP) { + untimeout(acctwatch, NULL, acctwatch_handle); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + p->p_ucred, p); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + return (error); + + /* + * Save the new accounting file vnode, and schedule the new + * free space watcher. + */ + acctp = nd.ni_vp; + acctwatch(NULL); + return (error); +} + +/* + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) + */ + +int +acct_process(p) + struct proc *p; +{ + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. 
+ */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + calcru(p, &ut, &st, NULL); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_cred->p_ruid; + acct.ac_gid = p->p_cred->p_rgid; + + /* (7) The terminal from which the process was started */ + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; + else + acct.ac_tty = NODEV; + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Eliminate any file size rlimit. + */ + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } + p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + + /* + * Write the accounting information to the file. + */ + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred, + (int *)0, p)); +} + +/* + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} + +/* + * Periodically check the file system to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. 
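The comp_t encoding produced by encode_comp_t() a few lines up is easiest to see by running it backwards; a standalone, hedged sketch of the inverse (AHZ is assumed to be 64, its usual value in <sys/acct.h>):

#include <stdio.h>
#include <stdint.h>

#define AHZ      64	/* assumed; must match <sys/acct.h> */
#define MANTSIZE 13	/* 13 bit mantissa, as above */
#define EXPSIZE  3	/* base 8 (3 bit) exponent, as above */

/* Recover seconds from the packed mantissa/exponent pair. */
static double
decode_comp_t(uint16_t c)
{
	unsigned long long t = c & ((1u << MANTSIZE) - 1);
	int exp = c >> MANTSIZE;

	while (exp-- > 0)
		t <<= EXPSIZE;			/* multiply by 8 per step */
	return ((double)t / AHZ);		/* back to seconds */
}

int
main(void)
{
	/* 2.5 seconds is 160 ticks at AHZ=64; that fits in the mantissa,
	 * so it round-trips exactly and this prints 2.50. */
	printf("%.2f\n", decode_comp_t(160));
	return (0);
}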
+ */ +/* ARGSUSED */ +static void +acctwatch(a) + void *a; +{ + struct statfs sb; + + if (savacctp != NULLVP) { + if (savacctp->v_type == VBAD) { + (void) vn_close(savacctp, FWRITE, NOCRED, NULL); + savacctp = NULLVP; + return; + } + (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail > acctresume * sb.f_blocks / 100) { + acctp = savacctp; + savacctp = NULLVP; + log(LOG_NOTICE, "Accounting resumed\n"); + } + } else { + if (acctp == NULLVP) + return; + if (acctp->v_type == VBAD) { + (void) vn_close(acctp, FWRITE, NOCRED, NULL); + acctp = NULLVP; + return; + } + (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { + savacctp = acctp; + acctp = NULLVP; + log(LOG_NOTICE, "Accounting suspended\n"); + } + } + acctwatch_handle = timeout(acctwatch, NULL, acctchkfreq * hz); +} diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c new file mode 100644 index 0000000..2ea378f --- /dev/null +++ b/sys/kern/kern_clock.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org> + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#if defined(SMP) && defined(BETTER_CLOCK) +#include <machine/smp.h> +#endif + +/* This is where the NTIMECOUNTER option hangs out */ +#include "opt_ntp.h" + +/* + * Number of timecounters used to implement stable storage + */ +#ifndef NTIMECOUNTER +#define NTIMECOUNTER 5 +#endif + +static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", + "Timecounter stable storage"); + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +static void tco_forward __P((int force)); +static void tco_setscales __P((struct timecounter *tc)); +static __inline unsigned tco_delta __P((struct timecounter *tc)); + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +#if defined(SMP) && defined(BETTER_CLOCK) +long cp_time[CPUSTATES]; +#else +static long cp_time[CPUSTATES]; +#endif + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +time_t time_second; + +/* + * Which update policy to use. + * 0 - every tick, bad hardware may fail with "calcru negative..." + * 1 - more resistent to the above hardware, but less efficient. + */ +static int tco_method; + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * timeservices. + */ + +static unsigned +dummy_get_timecount(struct timecounter *tc) +{ + static unsigned now; + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, + 0, + ~0u, + 1000000, + "dummy" +}; + +struct timecounter *timecounter = &dummy_timecounter; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) 
+ * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct proc *p; + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + +#if defined(SMP) && defined(BETTER_CLOCK) + forward_hardclock(pscnt); +#endif + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + tco_forward(0); + ticks++; + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } else if (softticks + 1 == ticks) + ++softticks; +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
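A worked instance of the rounding described above may help; a standalone, hedged sketch assuming hz = 100 (so tick = 10000 microseconds), a common i386 configuration:

#include <stdio.h>

int
main(void)
{
	long hz = 100, tick = 1000000 / hz;	/* assumed clock rate */
	long sec = 2, usec = 500000;		/* a 2.5 second timeout */
	unsigned long nticks;

	/* Same arithmetic as the first tvtohz() branch below: round the
	 * microsecond total up to a whole tick, then add 1 for the tick
	 * that is already in progress. */
	nticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1;
	printf("%lu\n", nticks);		/* prints 251 */
	return (0);
}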
+ */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + register struct proc *p; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (curproc != NULL && CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state. + */ + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. 
The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.tickadj = tickadj; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +static __inline unsigned +tco_delta(struct timecounter *tc) +{ + + return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) & + tc->tc_counter_mask); +} + +/* + * We have four functions for looking at the clock, two for microseconds + * and two for nanoseconds. For each there is fast but less precise + * version "get{nano|micro}time" which will return a time which is up + * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time" + * will return a timestamp which is as precise as possible. 
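As a usage note for the four flavours just described, a hedged kernel-side fragment (not a complete function; getmicrotime() and microuptime() are defined below, and timevalsub() is the helper already used in kern_acct.c above):

	struct timeval stamp, t0, t1;

	getmicrotime(&stamp);	/* cheap: may lag the clock by up to 1/hz */
	microuptime(&t0);
	/* ... short operation being measured ... */
	microuptime(&t1);
	timevalsub(&t1, &t0);	/* t1 now holds the precise elapsed time */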
+ */ + +void +getmicrotime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tvp = tc->tc_microtime; + } else { + microtime(tvp); + } +} + +void +getnanotime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tsp = tc->tc_nanotime; + } else { + nanotime(tsp); + } +} + +void +microtime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + tv->tv_usec += boottime.tv_usec; + tv->tv_sec += boottime.tv_sec; + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanotime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +timecounter_timespec(unsigned count, struct timespec *ts) +{ + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count -= tc->tc_offset_count; + count &= tc->tc_counter_mask; + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tvp->tv_sec = tc->tc_offset_sec; + tvp->tv_usec = tc->tc_offset_micro; + } else { + microuptime(tvp); + } +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tsp->tv_sec = tc->tc_offset_sec; + tsp->tv_nsec = tc->tc_offset_nano >> 32; + } else { + nanouptime(tsp); + } +} + +void +microuptime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanouptime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + if (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +static void +tco_setscales(struct timecounter *tc) +{ + u_int64_t scale; + + scale = 1000000000LL << 32; + if (tc->tc_adjustment > 0) + scale += (tc->tc_adjustment * 1000LL) << 10; + else + scale -= (-tc->tc_adjustment * 1000LL) << 10; + scale /= tc->tc_frequency; + tc->tc_scale_micro = scale / 1000; + tc->tc_scale_nano_f = scale & 0xffffffff; + tc->tc_scale_nano_i = scale >> 32; +} + +void +init_timecounter(struct 
timecounter *tc) +{ + struct timespec ts1; + struct timecounter *t1, *t2, *t3; + int i; + + tc->tc_adjustment = 0; + tco_setscales(tc); + tc->tc_offset_count = tc->tc_get_timecount(tc); + tc->tc_tweak = tc; + MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK); + *t1 = *tc; + t2 = t1; + for (i = 1; i < NTIMECOUNTER; i++) { + MALLOC(t3, struct timecounter *, sizeof *t3, + M_TIMECOUNTER, M_WAITOK); + *t3 = *tc; + t3->tc_other = t2; + t2 = t3; + } + t1->tc_other = t3; + tc = t1; + + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + + /* XXX: For now always start using the counter. */ + tc->tc_offset_count = tc->tc_get_timecount(tc); + nanouptime(&ts1); + tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32; + tc->tc_offset_micro = ts1.tv_nsec / 1000; + tc->tc_offset_sec = ts1.tv_sec; + timecounter = tc; +} + +void +set_timecounter(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + /* fiddle all the little crinkly bits around the fiords... */ + tco_forward(1); +} + + +#if 0 /* Currently unused */ +void +switch_timecounter(struct timecounter *newtc) +{ + int s; + struct timecounter *tc; + struct timespec ts; + + s = splclock(); + tc = timecounter; + if (newtc == tc || newtc == tc->tc_other) { + splx(s); + return; + } + nanouptime(&ts); + newtc->tc_offset_sec = ts.tv_sec; + newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32; + newtc->tc_offset_micro = ts.tv_nsec / 1000; + newtc->tc_offset_count = newtc->tc_get_timecount(newtc); + timecounter = newtc; + splx(s); +} +#endif + +static struct timecounter * +sync_other_counter(void) +{ + struct timecounter *tc, *tcn, *tco; + unsigned delta; + + tco = timecounter; + tc = tco->tc_other; + tcn = tc->tc_other; + *tc = *tco; + tc->tc_other = tcn; + delta = tco_delta(tc); + tc->tc_offset_count += delta; + tc->tc_offset_count &= tc->tc_counter_mask; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32; + return (tc); +} + +static void +tco_forward(int force) +{ + struct timecounter *tc, *tco; + + tco = timecounter; + tc = sync_other_counter(); + /* + * We may be inducing a tiny error here, the tc_poll_pps() may + * process a latched count which happens after the tco_delta() + * in sync_other_counter(), which would extend the previous + * counters parameters into the domain of this new one. + * Since the timewindow is very small for this, the error is + * going to be only a few weenieseconds (as Dave Mills would + * say), so lets just not talk more about it, OK ? 
+ */ + if (tco->tc_poll_pps) + tco->tc_poll_pps(tco); + if (timedelta != 0) { + tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32; + timedelta -= tickdelta; + force++; + } + + while (tc->tc_offset_nano >= 1000000000ULL << 32) { + tc->tc_offset_nano -= 1000000000ULL << 32; + tc->tc_offset_sec++; + tc->tc_frequency = tc->tc_tweak->tc_frequency; + tc->tc_adjustment = tc->tc_tweak->tc_adjustment; + ntp_update_second(tc); /* XXX only needed if xntpd runs */ + tco_setscales(tc); + force++; + } + + if (tco_method && !force) + return; + + tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32; + + /* Figure out the wall-clock time */ + tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec; + tc->tc_nanotime.tv_nsec = + (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000; + tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec; + if (tc->tc_nanotime.tv_nsec >= 1000000000) { + tc->tc_nanotime.tv_nsec -= 1000000000; + tc->tc_microtime.tv_usec -= 1000000; + tc->tc_nanotime.tv_sec++; + } + time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec; + + timecounter = tc; +} + +static int +sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_frequency, + sizeof(timecounter->tc_tweak->tc_frequency), req)); +} + +static int +sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_adjustment, + sizeof(timecounter->tc_tweak->tc_adjustment), req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0, + "This variable determines the method used for updating timecounters. " + "If the default algorithm (0) fails with \"calcru negative...\" messages " + "try the alternate algorithm (1) which handles bad hardware better." + +); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", ""); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", ""); diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..df832f6 --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,220 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. 
Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_conf.c,v 1.28 1998/10/25 17:44:50 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/module.h> +#include <sys/conf.h> +#include <sys/vnode.h> + +#define NUMBDEV 128 +#define NUMCDEV 256 +#define bdevsw_ALLOCSTART (NUMBDEV/2) +#define cdevsw_ALLOCSTART (NUMCDEV/2) + +struct cdevsw *bdevsw[NUMBDEV]; +int nblkdev = NUMBDEV; +struct cdevsw *cdevsw[NUMCDEV]; +int nchrdev = NUMCDEV; + +/* + * Routine to convert from character to block device number. + * + * A minimal stub routine can always return NODEV. + */ +dev_t +chrtoblk(dev_t dev) +{ + struct cdevsw *cd; + + if(cd = cdevsw[major(dev)]) { + if (cd->d_bmaj != -1) + return(makedev(cd->d_bmaj,minor(dev))); + } + return(NODEV); +} + +/* + * (re)place an entry in the bdevsw or cdevsw table + * return the slot used in major(*descrip) + */ +static int +bdevsw_add(dev_t *descrip, + struct cdevsw *newentry, + struct cdevsw **oldentry) +{ + int i ; + + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ + /* + * Search the table looking for a slot... + */ + for (i = bdevsw_ALLOCSTART; i < nblkdev; i++) + if (bdevsw[i] == NULL) + break; /* found one! */ + /* out of allocable slots? */ + if (i >= nblkdev) { + return ENFILE; + } + } else { /* assign */ + i = major(*descrip); + if (i < 0 || i >= nblkdev) { + return EINVAL; + } + } + + /* maybe save old */ + if (oldentry) { + *oldentry = bdevsw[i]; + } + if (newentry) { + newentry->d_bmaj = i; + } + /* replace with new */ + bdevsw[i] = newentry; + + /* done! let them know where we put it */ + *descrip = makedev(i,0); + return 0; +} + +int +cdevsw_add(dev_t *descrip, + struct cdevsw *newentry, + struct cdevsw **oldentry) +{ + int i ; + + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ + /* + * Search the table looking for a slot... + */ + for (i = cdevsw_ALLOCSTART; i < nchrdev; i++) + if (cdevsw[i] == NULL) + break; /* found one! */ + /* out of allocable slots? */ + if (i >= nchrdev) { + return ENFILE; + } + } else { /* assign */ + i = major(*descrip); + if (i < 0 || i >= nchrdev) { + return EINVAL; + } + } + + /* maybe save old */ + if (oldentry) { + *oldentry = cdevsw[i]; + } + if (newentry) { + newentry->d_bmaj = -1; + newentry->d_maj = i; + } + /* replace with new */ + cdevsw[i] = newentry; + + /* done! let them know where we put it */ + *descrip = makedev(i,0); + return 0; +} + +/* + * note must call cdevsw_add before bdevsw_add due to d_bmaj hack. 
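For orientation, a character driver of this era claims its slot through the cdevsw_add() defined above; a hedged registration sketch (the "mydev" names and major number 123 are placeholders, and mydev_cdevsw is assumed to be a fully initialized struct cdevsw declared elsewhere):

	extern struct cdevsw mydev_cdevsw;

	static void
	mydev_drvinit(void *unused)
	{
		dev_t dev = makedev(123, 0);	/* NODEV would request auto-assignment */

		if (cdevsw_add(&dev, &mydev_cdevsw, NULL) != 0)
			printf("mydev: cdevsw_add failed\n");
	}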
+ */ +void +cdevsw_add_generic(int bdev, int cdev, struct cdevsw *cdevsw) +{ + dev_t dev; + + dev = makedev(cdev, 0); + cdevsw_add(&dev, cdevsw, NULL); + dev = makedev(bdev, 0); + bdevsw_add(&dev, cdevsw, NULL); +} + +int +cdevsw_module_handler(module_t mod, int what, void *arg) +{ + struct cdevsw_module_data* data = (struct cdevsw_module_data*) arg; + int error; + + switch (what) { + case MOD_LOAD: + if (error = cdevsw_add(&data->dev, data->cdevsw, NULL)) + return error; + break; + + case MOD_UNLOAD: + if (error = cdevsw_add(&data->dev, NULL, NULL)) + return error; + break; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} + +int +bdevsw_module_handler(module_t mod, int what, void* arg) +{ + struct bdevsw_module_data* data = (struct bdevsw_module_data*) arg; + int error; + + switch (what) { + case MOD_LOAD: + if (error = cdevsw_add(&data->cdev, data->cdevsw, NULL)) + return error; + if (error = bdevsw_add(&data->bdev, data->cdevsw, NULL)) { + cdevsw_add(&data->bdev, NULL, NULL); + return error; + } + break; + + case MOD_UNLOAD: + if (error = bdevsw_add(&data->bdev, NULL, NULL)) + return error; + if (error = cdevsw_add(&data->cdev, NULL, NULL)) + return error; + break; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c new file mode 100644 index 0000000..1d18a86 --- /dev/null +++ b/sys/kern/kern_descrip.c @@ -0,0 +1,1313 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $Id: kern_descrip.c,v 1.57 1998/11/11 10:55:56 truckman Exp $ + */ + +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/conf.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/resourcevar.h> +#include <sys/pipe.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); +MALLOC_DEFINE(M_FILE, "file", "Open file structure"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = + { fdopen, noclose, noread, nowrite, + noioc, nostop, nullreset, nodevtotty, + seltrue, nommap, nostrat }; + +static int finishdup __P((struct filedesc *fdp, int old, int new, register_t *retval)); +/* + * Descriptor management. + */ +struct filelist filehead; /* head of list of open files */ +int nfiles; /* actual number of open files */ +extern int cmask; + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +getdtablesize(p, uap) + struct proc *p; + struct getdtablesize_args *uap; +{ + + p->p_retval[0] = + min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* ARGSUSED */ +int +dup2(p, uap) + struct proc *p; + struct dup2_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register u_int old = uap->from, new = uap->to; + int i, error; + + if (old >= fdp->fd_nfiles || + fdp->fd_ofiles[old] == NULL || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfilesperproc) + return (EBADF); + if (old == new) { + p->p_retval[0] = new; + return (0); + } + if (new >= fdp->fd_nfiles) { + if ((error = fdalloc(p, new, &i))) + return (error); + if (new != i) + panic("dup2: fdalloc"); + } else if (fdp->fd_ofiles[new]) { + if (fdp->fd_ofileflags[new] & UF_MAPPED) + (void) munmapfd(p, new); + /* + * dup2() must succeed even if the close has an error. + */ + (void) closef(fdp->fd_ofiles[new], p); + } + return (finishdup(fdp, (int)old, (int)new, p->p_retval)); +} + +/* + * Duplicate a file descriptor. 
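The userland view of the dup2()/dup() pair implemented here, as a small hedged demo (redirecting stdout into a scratch file is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/dup2-demo.log", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd == -1)
		return (1);
	/* dup2() closes descriptor 1 if it is open (the closef() call in
	 * the kernel code above) and makes it refer to the same open file
	 * as fd; dup() would instead pick the lowest free descriptor. */
	if (dup2(fd, STDOUT_FILENO) == -1)
		return (1);
	printf("this line lands in /tmp/dup2-demo.log\n");
	close(fd);
	return (0);
}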
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +dup(p, uap) + struct proc *p; + struct dup_args *uap; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + +#if 0 + /* + * XXX Compatibility + */ + if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, p->p_retval)); } +#endif + + fdp = p->p_fd; + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) + return (EBADF); + if ((error = fdalloc(p, 0, &new))) + return (error); + return (finishdup(fdp, (int)old, new, p->p_retval)); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* ARGSUSED */ +int +fcntl(p, uap) + struct proc *p; + register struct fcntl_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register char *pop; + struct vnode *vp; + int i, tmp, error, flg = F_POSIX; + struct flock fl; + u_int newmin; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + pop = &fdp->fd_ofileflags[uap->fd]; + switch (uap->cmd) { + + case F_DUPFD: + newmin = uap->arg; + if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + newmin >= maxfilesperproc) + return (EINVAL); + if ((error = fdalloc(p, newmin, &i))) + return (error); + return (finishdup(fdp, uap->fd, i, p->p_retval)); + + case F_GETFD: + p->p_retval[0] = *pop & 1; + return (0); + + case F_SETFD: + *pop = (*pop &~ 1) | (uap->arg & 1); + return (0); + + case F_GETFL: + p->p_retval[0] = OFLAGS(fp->f_flag); + return (0); + + case F_SETFL: + fp->f_flag &= ~FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; + tmp = fp->f_flag & FNONBLOCK; + error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + if (error) + return (error); + tmp = fp->f_flag & FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + if (!error) + return (0); + fp->f_flag &= ~FNONBLOCK; + tmp = 0; + (void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + return (error); + + case F_GETOWN: + error = (*fp->f_ops->fo_ioctl) + (fp, FIOGETOWN, (caddr_t)p->p_retval, p); + return (error); + + case F_SETOWN: + return ((*fp->f_ops->fo_ioctl) + (fp, FIOSETOWN, (caddr_t)&uap->arg, p)); + + case F_SETLKW: + flg |= F_WAIT; + /* Fall into F_SETLK */ + + case F_SETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) + return (error); + if (fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + switch (fl.l_type) { + + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_UNLCK: + return (VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl, + F_POSIX)); + + default: + return (EINVAL); + } + + case F_GETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) + return (error); + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) + return (EINVAL); + if (fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + if ((error = 
VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX))) + return (error); + return (copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg, + sizeof(fl))); + + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Common code for dup, dup2, and fcntl(F_DUPFD). + */ +static int +finishdup(fdp, old, new, retval) + register struct filedesc *fdp; + register int old, new; + register_t *retval; +{ + register struct file *fp; + + fp = fdp->fd_ofiles[old]; + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + fp->f_count++; + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + return (0); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(sigio) + struct sigio *sigio; +{ + int s; + + if (sigio == NULL) + return; + s = splhigh(); + *(sigio->sio_myref) = NULL; + splx(s); + if (sigio->sio_pgid < 0) { + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + } else /* if ((*sigiop)->sio_pgid > 0) */ { + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + } + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); +} + +/* Free a list of sigio structures. */ +void +funsetownlst(sigiolst) + struct sigiolst *sigiolst; +{ + struct sigio *sigio; + + while ((sigio = sigiolst->slh_first) != NULL) + funsetown(sigio); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). + * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pgid, sigiop) + pid_t pgid; + struct sigio **sigiop; +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int s; + + if (pgid == 0) { + funsetown(*sigiop); + return (0); + } + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) + return (ESRCH); + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + else if (proc->p_session != curproc->p_session) + return (EPERM); + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) + return (ESRCH); + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + else if (pgrp->pg_session != curproc->p_session) + return (EPERM); + proc = NULL; + } + funsetown(*sigiop); + MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, + M_WAITOK); + if (pgid > 0) { + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + } else { + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + } + sigio->sio_pgid = pgid; + crhold(curproc->p_ucred); + sigio->sio_ucred = curproc->p_ucred; + /* It would be convenient if p_ruid was in ucred. */ + sigio->sio_ruid = curproc->p_cred->p_ruid; + sigio->sio_myref = sigiop; + s = splhigh(); + *sigiop = sigio; + splx(s); + return (0); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(sigio) + struct sigio *sigio; +{ + return (sigio != NULL ? sigio->sio_pgid : 0); +} + +/* + * Close a file descriptor. 
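+ *
+ * The slot is cleared and the fd_freefile/fd_lastfile hints are
+ * updated before closef() drops the reference on the underlying
+ * struct file, which also releases any POSIX locks this process
+ * holds on the vnode.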
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +close(p, uap) + struct proc *p; + struct close_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register int fd = uap->fd; + register u_char *pf; + + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + pf = (u_char *)&fdp->fd_ofileflags[fd]; + if (*pf & UF_MAPPED) + (void) munmapfd(p, fd); + fdp->fd_ofiles[fd] = NULL; + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + *pf = 0; + return (closef(fp, p)); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* ARGSUSED */ +int +ofstat(p, uap) + struct proc *p; + register struct ofstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + struct ostat oub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("ofstat"); + /*NOTREACHED*/ + } + cvtstat(&ub, &oub); + if (error == 0) + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* ARGSUSED */ +int +fstat(p, uap) + struct proc *p; + register struct fstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("fstat"); + /*NOTREACHED*/ + } + if (error == 0) + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); + return (error); +} + +/* + * Return status information about a file descriptor. 
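+ * This variant reports in the alternate "struct nstat" layout: the
+ * data is gathered into an ordinary struct stat and converted with
+ * cvtnstat() before the copyout.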
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* ARGSUSED */ +int +nfstat(p, uap) + struct proc *p; + register struct nfstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + struct nstat nub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("fstat"); + /*NOTREACHED*/ + } + if (error == 0) { + cvtnstat(&ub, &nub); + error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); + } + return (error); +} + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* ARGSUSED */ +int +fpathconf(p, uap) + struct proc *p; + register struct fpathconf_args *uap; +{ + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_PIPE: + case DTYPE_SOCKET: + if (uap->name != _PC_PIPE_BUF) + return (EINVAL); + p->p_retval[0] = PIPE_BUF; + return (0); + + case DTYPE_FIFO: + case DTYPE_VNODE: + vp = (struct vnode *)fp->f_data; + return (VOP_PATHCONF(vp, uap->name, p->p_retval)); + + default: + panic("fpathconf"); + } + /*NOTREACHED*/ +} + +/* + * Allocate a file descriptor for the process. + */ +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); + +int +fdalloc(p, want, result) + struct proc *p; + int want; + int *result; +{ + register struct filedesc *fdp = p->p_fd; + register int i; + int lim, last, nfiles; + struct file **newofile; + char *newofileflags; + + /* + * Search for a free descriptor starting at the higher + * of want or fd_freefile. If that fails, consider + * expanding the ofile array. + */ + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + for (;;) { + last = min(fdp->fd_nfiles, lim); + if ((i = want) < fdp->fd_freefile) + i = fdp->fd_freefile; + for (; i < last; i++) { + if (fdp->fd_ofiles[i] == NULL) { + fdp->fd_ofileflags[i] = 0; + if (i > fdp->fd_lastfile) + fdp->fd_lastfile = i; + if (want <= fdp->fd_freefile) + fdp->fd_freefile = i; + *result = i; + return (0); + } + } + + /* + * No space in current array. Expand? + */ + if (fdp->fd_nfiles >= lim) + return (EMFILE); + if (fdp->fd_nfiles < NDEXTENT) + nfiles = NDEXTENT; + else + nfiles = 2 * fdp->fd_nfiles; + MALLOC(newofile, struct file **, nfiles * OFILESIZE, + M_FILEDESC, M_WAITOK); + newofileflags = (char *) &newofile[nfiles]; + /* + * Copy the existing ofile and ofileflags arrays + * and zero the new portion of each array. 
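+ * Both arrays live in a single allocation; the flag bytes start
+ * immediately after the nfiles file pointers (hence the sizing in
+ * OFILESIZE units above), so one MALLOC/FREE pair covers each
+ * expansion.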
+ */ + bcopy(fdp->fd_ofiles, newofile, + (i = sizeof(struct file *) * fdp->fd_nfiles)); + bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); + bcopy(fdp->fd_ofileflags, newofileflags, + (i = sizeof(char) * fdp->fd_nfiles)); + bzero(newofileflags + i, nfiles * sizeof(char) - i); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + fdp->fd_ofiles = newofile; + fdp->fd_ofileflags = newofileflags; + fdp->fd_nfiles = nfiles; + fdexpand++; + } + return (0); +} + +/* + * Check to see whether n user file descriptors + * are available to the process p. + */ +int +fdavail(p, n) + struct proc *p; + register int n; +{ + register struct filedesc *fdp = p->p_fd; + register struct file **fpp; + register int i, lim, last; + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + + last = min(fdp->fd_nfiles, lim); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) + if (*fpp == NULL && --n <= 0) + return (1); + return (0); +} + +/* + * Create a new open file structure and allocate + * a file decriptor for the process that refers to it. + */ +int +falloc(p, resultfp, resultfd) + register struct proc *p; + struct file **resultfp; + int *resultfd; +{ + register struct file *fp, *fq; + int error, i; + + if ((error = fdalloc(p, 0, &i))) + return (error); + if (nfiles >= maxfiles) { + tablefull("file"); + return (ENFILE); + } + /* + * Allocate a new file descriptor. + * If the process has file descriptor zero open, add to the list + * of open files at that point, otherwise put it at the front of + * the list of open files. + */ + nfiles++; + MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); + bzero(fp, sizeof(struct file)); + if ((fq = p->p_fd->fd_ofiles[0])) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + p->p_fd->fd_ofiles[i] = fp; + fp->f_count = 1; + fp->f_cred = p->p_ucred; + fp->f_seqcount = 1; + crhold(fp->f_cred); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Free a file descriptor. + */ +void +ffree(fp) + register struct file *fp; +{ + LIST_REMOVE(fp, f_list); + crfree(fp->f_cred); +#if defined(DIAGNOSTIC) || defined(INVARIANTS) + fp->f_count = 0; +#endif + nfiles--; + FREE(fp, M_FILE); +} + +/* + * Build a new filedesc structure. + */ +struct filedesc * +fdinit(p) + struct proc *p; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = p->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bzero(newfdp, sizeof(struct filedesc0)); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + VREF(newfdp->fd_fd.fd_rdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + + newfdp->fd_fd.fd_freefile = 0; + newfdp->fd_fd.fd_lastfile = 0; + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + p->p_fd->fd_refcnt++; + return (p->p_fd); +} + +/* + * Copy a filedesc structure. 
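+ *
+ * The copy gets a private table: every open file it references has
+ * its f_count bumped, and the ofiles array is either the embedded
+ * NDFILE-sized one or a fresh allocation sized down toward the
+ * descriptors actually in use.  execve() uses this to un-share a
+ * table whose fd_refcnt is greater than one.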
+ */ +struct filedesc * +fdcopy(p) + struct proc *p; +{ + register struct filedesc *newfdp, *fdp = p->p_fd; + register struct file **fpp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return NULL; + + MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bcopy(fdp, newfdp, sizeof(struct filedesc)); + VREF(newfdp->fd_cdir); + VREF(newfdp->fd_rdir); + newfdp->fd_refcnt = 1; + + /* + * If the number of open files fits in the internal arrays + * of the open file structure, use them, otherwise allocate + * additional memory for the number of descriptors currently + * in use. + */ + if (newfdp->fd_lastfile < NDFILE) { + newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; + newfdp->fd_ofileflags = + ((struct filedesc0 *) newfdp)->fd_dfileflags; + i = NDFILE; + } else { + /* + * Compute the smallest multiple of NDEXTENT needed + * for the file descriptors currently in use, + * allowing the table to shrink. + */ + i = newfdp->fd_nfiles; + while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) + i /= 2; + MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, + M_FILEDESC, M_WAITOK); + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; + } + newfdp->fd_nfiles = i; + bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); + bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); + fpp = newfdp->fd_ofiles; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) + if (*fpp != NULL) + (*fpp)->f_count++; + return (newfdp); +} + +/* + * Release a filedesc structure. + */ +void +fdfree(p) + struct proc *p; +{ + register struct filedesc *fdp = p->p_fd; + struct file **fpp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return; + + if (--fdp->fd_refcnt > 0) + return; + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) + if (*fpp) + (void) closef(*fpp, p); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + vrele(fdp->fd_cdir); + vrele(fdp->fd_rdir); + FREE(fdp, M_FILEDESC); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(p) + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file **fpp; + char *fdfp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return; + + fpp = fdp->fd_ofiles; + fdfp = fdp->fd_ofileflags; + for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++) + if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) { + if (*fdfp & UF_MAPPED) + (void) munmapfd(p, i); + (void) closef(*fpp, p); + *fpp = NULL; + *fdfp = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; +} + +/* + * Internal form of close. + * Decrement reference count on file structure. + * Note: p may be NULL when closing a file + * that was being passed in a message. + */ +int +closef(fp, p) + register struct file *fp; + register struct proc *p; +{ + struct vnode *vp; + struct flock lf; + int error; + + if (fp == NULL) + return (0); + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor. 
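+ *
+ * Note the two different lock owners below: POSIX locks are owned by
+ * the process (the id handed to VOP_ADVLOCK is `p'), while
+ * flock()-style locks are owned by the open file itself (the id is
+ * `fp'), so the latter are only released once the last reference to
+ * the file goes away.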
+ */ + if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX); + } + if (--fp->f_count > 0) + return (0); + if (fp->f_count < 0) + panic("closef: count < 0"); + if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + } + if (fp->f_ops) + error = (*fp->f_ops->fo_close)(fp, p); + else + error = 0; + ffree(fp); + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on + * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* ARGSUSED */ +int +flock(p, uap) + struct proc *p; + register struct flock_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + struct flock lf; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EOPNOTSUPP); + vp = (struct vnode *)fp->f_data; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + fp->f_flag &= ~FHASLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else + return (EBADF); + fp->f_flag |= FHASLOCK; + if (uap->how & LOCK_NB) + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); +} + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + */ +/* ARGSUSED */ +static int +fdopen(dev, mode, type, p) + dev_t dev; + int mode, type; + struct proc *p; +{ + + /* + * XXX Kludge: set curproc->p_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + p->p_dupfd = minor(dev); + return (ENODEV); +} + +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(fdp, indx, dfd, mode, error) + register struct filedesc *fdp; + register int indx, dfd; + int mode; + int error; +{ + register struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, reject. Note, check for new == old is necessary as + * falloc could allocate an already closed to-be-dup'd descriptor + * as the new descriptor. + */ + fp = fdp->fd_ofiles[indx]; + if ((u_int)dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp) + return (EBADF); + + /* + * There are two cases of interest here. 
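+ * (ENODEV is the sentinel that fdopen() above arranges to have
+ * returned from the attempted open of a /dev/fd node; ENXIO is
+ * presumed to come from drivers that want the descriptor handed over
+ * rather than shared.)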
+ * + * For ENODEV simply dup (dfd) to file descriptor + * (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and + * store it in (indx). (dfd) is effectively closed by + * this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + return (EACCES); + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + wfp->f_count++; + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd, and stuff it into indx. + */ + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + /* + * Complete the clean up of the filedesc structure by + * recomputing the various hints. + */ + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + else + while (fdp->fd_lastfile > 0 && + fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (dfd < fdp->fd_freefile) + fdp->fd_freefile = dfd; + return (0); + + default: + return (error); + } + /* NOTREACHED */ +} + +/* + * Get file structures. + */ +static int +sysctl_kern_file SYSCTL_HANDLER_ARGS +{ + int error; + struct file *fp; + + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + return (SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file))); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) + return (error); + + /* + * followed by an array of file structures + */ + for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", ""); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, + CTLFLAG_RW, &maxfilesperproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, ""); + +static fildesc_devsw_installed = 0; +#ifdef DEVFS +static void *devfs_token_stdin; +static void *devfs_token_stdout; +static void *devfs_token_stderr; +static void *devfs_token_fildesc[NUMFDESC]; +#endif + +static void fildesc_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int fd; +#endif + + if( ! fildesc_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&fildesc_cdevsw,NULL); + fildesc_devsw_installed = 1; +#ifdef DEVFS + for (fd = 0; fd < NUMFDESC; fd++) + devfs_token_fildesc[fd] = + devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR, + UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + devfs_token_stdin = + devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdin"); + devfs_token_stdout = + devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdout"); + devfs_token_stderr = + devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stderr"); +#endif + } +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + + diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c new file mode 100644 index 0000000..2243e27 --- /dev/null +++ b/sys/kern/kern_environment.c @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_environment.c,v 1.3 1998/10/09 21:21:34 msmith Exp $ + */ + +/* + * The unified bootloader passes us a pointer to a preserved copy of + * bootstrap/kernel environment variables. + * We make these available using sysctl for both in-kernel and + * out-of-kernel consumers. + * + * Note that the current sysctl infrastructure doesn't allow + * dynamic insertion or traversal through handled spaces. Grr. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <machine/bootinfo.h> + +char *kern_envp; + +static char *kernenv_next(char *cp); + +char * +getenv(char *name) +{ + char *cp, *ep; + int len; + + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + for (ep = cp; (*ep != '=') && (*ep != 0); ep++) + ; + len = ep - cp; + if (*ep = '=') + ep++; + if (!strncmp(name, cp, len)) + return(ep); + } + return(NULL); +} + +/* + * Return an integer value from an environment variable. + */ +int +getenv_int(char *name, int *data) +{ + char *value, *vtp; + quad_t iv; + + if ((value = getenv(name)) == NULL) + return(0); + + iv = strtoq(value, &vtp, 0); + if ((vtp == value) || (*vtp != 0)) + return(0); + + *data = (int)iv; + return(1); +} + +static int +sysctl_kernenv SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1; + u_int namelen = arg2; + char *cp; + int i, error; + + if (kern_envp == NULL) + return(ENOENT); + + name++; + namelen--; + + if (namelen != 1) + return(EINVAL); + + cp = kern_envp; + for (i = 0; i < name[0]; i++) { + cp = kernenv_next(cp); + if (cp == NULL) + break; + } + + if (cp == NULL) + return(ENOENT); + + error = SYSCTL_OUT(req, cp, strlen(cp) + 1); + return (error); +} + +SYSCTL_NODE(_kern, OID_AUTO, environment, CTLFLAG_RD, sysctl_kernenv, "kernel environment space"); + +/* + * Find the next entry after the one which (cp) falls within, return a + * pointer to its start or NULL if there are no more. 
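+ *
+ * The environment is assumed to be a packed block of NUL-terminated
+ * "name=value" strings ending with an empty string (two consecutive
+ * NULs); that is the layout the scan below depends on.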
+ */ +static char * +kernenv_next(char *cp) +{ + if (cp != NULL) { + while (*cp != 0) + cp++; + cp++; + if (*cp == 0) + cp = NULL; + } + return(cp); +} + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c new file mode 100644 index 0000000..dd63672 --- /dev/null +++ b/sys/kern/kern_exec.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_exec.c,v 1.92 1998/12/30 10:38:59 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/fcntl.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/wait.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/sysent.h> +#include <sys/shm.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/buf.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_zone.h> +#include <vm/vm_pager.h> + +#include <machine/reg.h> + +static long *exec_copyout_strings __P((struct image_params *)); + +static long ps_strings = PS_STRINGS; +SYSCTL_LONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, ""); + +static long usrstack = USRSTACK; +SYSCTL_LONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, ""); + +/* + * Each of the items is a pointer to a `const struct execsw', hence the + * double pointer here. + */ +static const struct execsw **execsw; + +#ifndef _SYS_SYSPROTO_H_ +struct execve_args { + char *fname; + char **argv; + char **envv; +}; +#endif + +/* + * execve() system call. 
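+ *
+ * Outline: set up an image_params and a temporary string buffer,
+ * translate the path, then walk the registered execsw image
+ * activators (restarting with the interpreter for interpreted
+ * images).  On success the strings are copied out onto the new stack
+ * by exec_copyout_strings(), the descriptor table is un-shared and
+ * close-on-exec descriptors are closed, set-id credentials are
+ * applied, and setregs() aims the process at its new entry point.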
+ */ +int +execve(p, uap) + struct proc *p; + register struct execve_args *uap; +{ + struct nameidata nd, *ndp; + long *stack_base; + int error, len, i; + struct image_params image_params, *imgp; + struct vattr attr; + + imgp = &image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->argc = imgp->envc = 0; + imgp->argv0 = NULL; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + imgp->vp = NULL; + imgp->firstpage = NULL; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + imgp->image_header = imgp->stringbase + ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, p); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + imgp->fname = uap->fname; + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(imgp->vp, 0, p); + goto exec_fail_dealloc; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(imgp->vp, 0, p); + if (error) + goto exec_fail_dealloc; + + /* + * Loop through list of image activators, calling each one. + * If there is no match, the activator returns -1. If there + * is a match, but there was an error during the activation, + * the error is returned. Otherwise 0 means success. If the + * image is interpreted, loop back up and try activating + * the interpreter. + */ + for (i = 0; execsw[i]; ++i) { + if (execsw[i]->ex_imgact) + error = (*execsw[i]->ex_imgact)(imgp); + else + continue; + if (error == -1) + continue; + if (error) + goto exec_fail_dealloc; + if (imgp->interpreted) { + exec_unmap_first_page(imgp); + /* free old vnode and name buffer */ + vrele(ndp->ni_vp); + zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, p); + goto interpret; + } + break; + } + /* If we made it through all the activators and none matched, exit. */ + if (error == -1) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + p->p_vmspace->vm_minsaddr = (char *)stack_base; + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. + * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* + * For security and other reasons, the file descriptor table cannot + * be shared after an exec. 
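+ * When the table is shared (fd_refcnt > 1) a private copy is taken
+ * with fdcopy() and the shared reference is dropped with fdfree().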
+ */ + if (p->p_fd->fd_refcnt > 1) { + struct filedesc *tmp; + + tmp = fdcopy(p); + fdfree(p); + p->p_fd = tmp; + } + + /* close files on exec */ + fdcloseexec(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has its own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. + * + * Don't honor setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + if ((attr.va_mode & VSUID && p->p_ucred->cr_uid != attr.va_uid || + attr.va_mode & VSGID && p->p_ucred->cr_gid != attr.va_gid) && + (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. + */ + if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { + p->p_traceflag = 0; + vrele(p->p_tracep); + p->p_tracep = NULL; + } + /* + * Set the new credentials. + */ + p->p_ucred = crcopy(p->p_ucred); + if (attr.va_mode & VSUID) + p->p_ucred->cr_uid = attr.va_uid; + if (attr.va_mode & VSGID) + p->p_ucred->cr_gid = attr.va_gid; + setsugid(p); + } else { + if (p->p_ucred->cr_uid == p->p_cred->p_ruid && + p->p_ucred->cr_gid == p->p_cred->p_rgid) + p->p_flag &= ~P_SUGID; + } + + /* + * Implement correct POSIX saved-id behavior. + */ + p->p_cred->p_svuid = p->p_ucred->cr_uid; + p->p_cred->p_svgid = p->p_ucred->cr_gid; + + /* + * Store the vp for use in procfs + */ + if (p->p_textvp) /* release old reference */ + vrele(p->p_textvp); + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. + */ + STOPEVENT(p, S_EXEC, 0); + + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Set entry address */ + setregs(p, imgp->entry_addr, (u_long)(uintptr_t)stack_base); + +exec_fail_dealloc: + + /* + * free various allocated resources + */ + if (imgp->firstpage) + exec_unmap_first_page(imgp); + + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + + if (imgp->vp) { + vrele(imgp->vp); + zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + } + + if (error == 0) + return (0); + +exec_fail: + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. 
exit gracefully */ + exit1(p, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + return(0); + } else { + return(error); + } +} + +int +exec_map_first_page(imgp) + struct image_params *imgp; +{ + int s, rv, i; + int initial_pagein; + vm_page_t ma[VM_INITIAL_PAGEIN]; + vm_object_t object; + + + if (imgp->firstpage) { + exec_unmap_first_page(imgp); + } + + object = imgp->vp->v_object; + s = splvm(); + + ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + + if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + initial_pagein = VM_INITIAL_PAGEIN; + if (initial_pagein > object->size) + initial_pagein = object->size; + for (i = 1; i < initial_pagein; i++) { + if (ma[i] = vm_page_lookup(object, i)) { + if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) + break; + if (ma[i]->valid) + break; + vm_page_busy(ma[i]); + } else { + ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); + if (ma[i] == NULL) + break; + } + } + initial_pagein = i; + + rv = vm_pager_get_pages(object, ma, initial_pagein, 0); + ma[0] = vm_page_lookup(object, 0); + + if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { + if (ma[0]) { + vm_page_protect(ma[0], VM_PROT_NONE); + vm_page_free(ma[0]); + } + splx(s); + return EIO; + } + } + + vm_page_wire(ma[0]); + vm_page_wakeup(ma[0]); + splx(s); + + pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0])); + imgp->firstpage = ma[0]; + + return 0; +} + +void +exec_unmap_first_page(imgp) + struct image_params *imgp; +{ + if (imgp->firstpage) { + pmap_kremove((vm_offset_t) imgp->image_header); + vm_page_unwire(imgp->firstpage, 1); + imgp->firstpage = NULL; + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. + */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct vmspace *vmspace = imgp->proc->p_vmspace; +#ifdef VM_STACK + caddr_t stack_addr = (caddr_t) (USRSTACK - MAXSSIZ); +#else + caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); +#endif + vm_map_t map = &vmspace->vm_map; + + imgp->vmspace_destroyed = 1; + + /* + * Blow away entire process VM, if address space not shared, + * otherwise, create a new VM space so that other threads are + * not disrupted + */ + if (vmspace->vm_refcnt == 1) { + if (vmspace->vm_shm) + shmexit(imgp->proc); + pmap_remove_pages(&vmspace->vm_pmap, 0, VM_MAXUSER_ADDRESS); + vm_map_remove(map, 0, VM_MAXUSER_ADDRESS); + } else { + vmspace_exec(imgp->proc); + vmspace = imgp->proc->p_vmspace; + map = &vmspace->vm_map; + } + + /* Allocate a new stack */ +#ifdef VM_STACK + error = vm_map_stack (&vmspace->vm_map, (vm_offset_t)stack_addr, + (vm_size_t)MAXSSIZ, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + + /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the + * VM_STACK case, but they are still used to monitor the size of the + * process stack so we can check the stack rlimit. + */ + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; +#else + error = vm_map_insert(&vmspace->vm_map, NULL, 0, + (vm_offset_t) stack_addr, (vm_offset_t) USRSTACK, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + + /* Initialize maximum stack address */ + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; +#endif + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. 
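+ *
+ * The pointer arrays are walked with fuword() and each string is
+ * brought in with copyinstr(); exhausting the ARG_MAX string space
+ * shows up as ENAMETOOLONG, which is reported to the caller as E2BIG.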
+ */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error; + size_t length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + argp = (caddr_t) (intptr_t) fuword(argv); + if (argp == (caddr_t) -1) + return (EFAULT); + if (argp) + argv++; + if (imgp->argv0) + argp = imgp->argv0; + if (argp) { + do { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } while ((argp = (caddr_t) (intptr_t) fuword(argv++))); + } + } + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +long * +exec_copyout_strings(imgp) + struct image_params *imgp; +{ + int argc, envc; + char **vectp; + char *stringp, *destp; + long *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = (struct ps_strings *)PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets, and 'AT_COUNT*2' is room for the + * ELF Auxargs data. + */ + vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + + AT_COUNT*2) * sizeof(char*)); + else + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); + + /* + * vectp also becomes our initial stack base + */ + stack_base = (long *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. + */ + suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. + */ + for (; argc > 0; --argc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer seperates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. 
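+ * As with the argument entries above, each slot receives the user
+ * address of its string; destp is advanced past each terminating NUL
+ * while stringp walks the block that was just copied out.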
+ */ + for (; envc > 0; --envc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Return 0 for success or error code on failure. + */ +int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct proc *p = imgp->proc; + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + int error; + + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) + return (ETXTBSY); + + /* + * Call filesystem specific open routine (which does nothing in the + * general case). + */ + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); +} + +/* + * Exec handler registration + */ +int +exec_register(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 2; /* New slot and trailing NULL */ + + if (execsw) + for (es = execsw; *es; es++) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + if (execsw) + for (es = execsw; *es; es++) + *xs++ = *es; + *xs++ = execsw_arg; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +exec_unregister(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 1; + + if (execsw == NULL) + panic("unregister with no handlers left?\n"); + + for (es = execsw; *es; es++) { + if (*es == execsw_arg) + break; + } + if (*es == NULL) + return ENOENT; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + *xs++ = *es; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c new file mode 100644 index 0000000..7be01af --- /dev/null +++ b/sys/kern/kern_exit.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $Id: kern_exit.c,v 1.70 1998/12/19 02:55:33 julian Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/tty.h> +#include <sys/wait.h> +#include <sys/vnode.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <sys/aio.h> + +#ifdef COMPAT_43 +#include <machine/reg.h> +#include <machine/psl.h> +#endif +#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */ + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_zone.h> +#ifdef COMPAT_LINUX_THREADS +#include <sys/user.h> +#endif + +static MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); + +static int wait1 __P((struct proc *, struct wait_args *, int)); + +/* + * callout list for things to do at exit time + */ +typedef struct exit_list_element { + struct exit_list_element *next; + exitlist_fn function; +} *ele_p; + +static ele_p exit_list; + +/* + * exit -- + * Death of process. + */ +void +exit(p, uap) + struct proc *p; + struct rexit_args /* { + int rval; + } */ *uap; +{ + + exit1(p, W_EXITCODE(uap->rval, 0)); + /* NOTREACHED */ +} + +/* + * Exit: deallocate address space and other resources, change proc state + * to zombie, and unlink proc from allproc and parent's lists. Save exit + * status and rusage for wait(). Check for child processes and orphan them. 
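+ *
+ * Ordering matters: the exit callouts, descriptor and address-space
+ * teardown run first (they may sleep), then the proc is moved to
+ * zombproc, its children are reparented to init, the parent is
+ * notified with SIGCHLD (or the zombie is handed to init when the
+ * parent sets P_NOCLDWAIT), and cpu_exit() finally switches away;
+ * the zombie itself is reclaimed later in wait1().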
+ */ +void +exit1(p, rv) + register struct proc *p; + int rv; +{ + register struct proc *q, *nq; + register struct vmspace *vm; + ele_p ep = exit_list; + + if (p->p_pid == 1) { + printf("init died (signal %d, exit %d)\n", + WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } + + aio_proc_rundown(p); + + /* are we a task leader? */ + if(p == p->p_leader) { + struct kill_args killArgs; + killArgs.signum = SIGKILL; + q = p->p_peers; + while(q) { + killArgs.pid = q->p_pid; + /* + * The interface for kill is better + * than the internal signal + */ + kill(p, &killArgs); + nq = q; + q = q->p_peers; + /* + * orphan the threads so we don't mess up + * when they call exit + */ + nq->p_peers = 0; + nq->p_leader = nq; + } + + /* otherwise are we a peer? */ + } else if(p->p_peers) { + q = p->p_leader; + while(q->p_peers != p) + q = q->p_peers; + q->p_peers = p->p_peers; + } + +#ifdef PGINPROF + vmsizmon(); +#endif + STOPEVENT(p, S_EXIT, rv); + + /* + * Check if any LKMs need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + while (ep) { + (*ep->function)(p); + ep = ep->next; + } + + if (p->p_flag & P_PROFIL) + stopprofclock(p); + MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), + M_ZOMBIE, M_WAITOK); + /* + * If parent is waiting for us to exit or exec, + * P_PPWAIT is set; we will wakeup the parent below. + */ + p->p_flag &= ~(P_TRACED | P_PPWAIT); + p->p_flag |= P_WEXIT; +#ifndef COMPAT_LINUX_THREADS + p->p_sigignore = ~0; +#endif /* COMPAT_LINUX_THREADS */ + p->p_siglist = 0; + if (timevalisset(&p->p_realtimer.it_value)) + untimeout(realitexpire, (caddr_t)p, p->p_ithandle); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pid. + */ + funsetownlst(&p->p_sigiolst); + + /* + * Close open files and release open-file table. + * This may block! + */ + fdfree(p); + + /* + * XXX Shutdown SYSV semaphores + */ + semexit(p); + + /* The next two chunks should probably be moved to vmspace_exit. */ + vm = p->p_vmspace; + /* + * Release user portion of address space. + * This releases references to vnodes, + * which could cause I/O if the file has been unlinked. + * Need to do this early enough that we can still sleep. + * Can't free the entire vmspace as the kernel stack + * may be mapped within that space also. + */ + if (vm->vm_refcnt == 1) { + if (vm->vm_shm) + shmexit(p); + pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + } + + if (SESS_LEADER(p)) { + register struct session *sp = p->p_session; + + if (sp->s_ttyvp) { + /* + * Controlling process. + * Signal foreground pgrp, + * drain controlling terminal + * and revoke access to controlling terminal. + */ + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { + if (sp->s_ttyp->t_pgrp) + pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); + (void) ttywait(sp->s_ttyp); + /* + * The tty could have been revoked + * if we blocked. + */ + if (sp->s_ttyvp) + VOP_REVOKE(sp->s_ttyvp, REVOKEALL); + } + if (sp->s_ttyvp) + vrele(sp->s_ttyvp); + sp->s_ttyvp = NULL; + /* + * s_ttyp is not zero'd; we use this to indicate + * that the session once had a controlling terminal. 
+ * (for logging and informational purposes) + */ + } + sp->s_leader = NULL; + } + fixjobc(p, p->p_pgrp, 0); + (void)acct_process(p); +#ifdef KTRACE + /* + * release trace file + */ + p->p_traceflag = 0; /* don't trace the vrele() */ + if (p->p_tracep) + vrele(p->p_tracep); +#endif + /* + * Remove proc from allproc queue and pidhash chain. + * Place onto zombproc. Unlink from parent's child list. + */ + LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); + p->p_stat = SZOMB; + + LIST_REMOVE(p, p_hash); + + q = p->p_children.lh_first; + if (q) /* only need this if any child is S_ZOMB */ + wakeup((caddr_t) initproc); + for (; q != 0; q = nq) { + nq = q->p_sibling.le_next; + LIST_REMOVE(q, p_sibling); + LIST_INSERT_HEAD(&initproc->p_children, q, p_sibling); + q->p_pptr = initproc; +#ifdef COMPAT_LINUX_THREADS + q->p_sigparent = 0; +#endif /* COMPAT_LINUX_THREADS */ + /* + * Traced processes are killed + * since their existence means someone is screwing up. + */ + if (q->p_flag & P_TRACED) { + q->p_flag &= ~P_TRACED; + psignal(q, SIGKILL); + } + } + + /* + * Save exit status and final rusage info, adding in child rusage + * info and self times. + */ + p->p_xstat = rv; + *p->p_ru = p->p_stats->p_ru; + calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); + ruadd(p->p_ru, &p->p_stats->p_cru); + + /* + * Notify parent that we're gone. If parent has the P_NOCLDWAIT + * flag set, notify process 1 instead (and hope it will handle + * this situation). + */ +#ifndef COMPAT_LINUX_THREADS + if (p->p_pptr->p_flag & P_NOCLDWAIT) { +#else + if (p->p_pptr->p_procsig->ps_flag & P_NOCLDWAIT) { +#endif /* COMPAT_LINUX_THREADS */ + struct proc *pp = p->p_pptr; + proc_reparent(p, initproc); + /* + * If this was the last child of our parent, notify + * parent, so in case he was wait(2)ing, he will + * continue. + */ + if (LIST_EMPTY(&pp->p_children)) + wakeup((caddr_t)pp); + } + +#ifndef COMPAT_LINUX_THREADS + psignal(p->p_pptr, SIGCHLD); +#else + if (p->p_sigparent && p->p_pptr != initproc) { + psignal(p->p_pptr, p->p_sigparent); + } else { + psignal(p->p_pptr, SIGCHLD); + } +#endif /* COMPAT_LINUX_THREADS */ + wakeup((caddr_t)p->p_pptr); +#if defined(tahoe) + /* move this to cpu_exit */ + p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL; +#endif + /* + * Clear curproc after we've done all operations + * that could block, and before tearing down the rest + * of the process state that might be used from clock, etc. + * Also, can't clear curproc while we're still runnable, + * as we're not on a run queue (we are current, just not + * a proper proc any longer!). + * + * Other substructures are freed from wait(). + */ + curproc = NULL; + if (--p->p_limit->p_refcnt == 0) { + FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } + + /* + * Finally, call machine-dependent code to release the remaining + * resources including address space, the kernel stack and pcb. + * The address space is released by "vmspace_free(p->p_vmspace)"; + * This is machine-dependent, as we may have to change stacks + * or ensure that the current one isn't reallocated before we + * finish. cpu_exit will end with a call to cpu_switch(), finishing + * our execution (pun intended). 
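+ *
+ * cpu_exit() does not return; what remains of the process is a
+ * zombie that wait1() will reap and hand back to proc_zone.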
+ */ + cpu_exit(p); +} + +#ifdef COMPAT_43 +#if defined(hp300) || defined(luna68k) +#include <machine/frame.h> +#define GETPS(rp) ((struct frame *)(rp))->f_sr +#else +#define GETPS(rp) (rp)[PS] +#endif + +int +owait(p, uap) + struct proc *p; + register struct owait_args /* { + int dummy; + } */ *uap; +{ + struct wait_args w; + +#ifdef PSL_ALLCC + if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { + w.options = 0; + w.rusage = NULL; + } else { + w.options = p->p_md.md_regs[R0]; + w.rusage = (struct rusage *)p->p_md.md_regs[R1]; + } +#else + w.options = 0; + w.rusage = NULL; +#endif + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(p, &w, 1)); +} +#endif /* COMPAT_43 */ + +int +wait4(p, uap) + struct proc *p; + struct wait_args *uap; +{ + + return (wait1(p, uap, 0)); +} + +static int +wait1(q, uap, compat) + register struct proc *q; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; + int compat; +{ + register int nfound; + register struct proc *p, *t; + int status, error; + + if (uap->pid == 0) + uap->pid = -q->p_pgid; + if (uap->options &~ (WUNTRACED|WNOHANG)) + return (EINVAL); +loop: + nfound = 0; + for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { + if (uap->pid != WAIT_ANY && + p->p_pid != uap->pid && p->p_pgid != -uap->pid) + continue; + nfound++; + if (p->p_stat == SZOMB) { + /* charge childs scheduling cpu usage to parent */ + if (curproc->p_pid != 1) { + curproc->p_estcpu = min(curproc->p_estcpu + + p->p_estcpu, UCHAR_MAX); + } + + q->p_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) + q->p_retval[1] = p->p_xstat; + else +#endif + if (uap->status) { + status = p->p_xstat; /* convert to int */ + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) + return (error); + } + if (uap->rusage && (error = copyout((caddr_t)p->p_ru, + (caddr_t)uap->rusage, sizeof (struct rusage)))) + return (error); + /* + * If we got the child via a ptrace 'attach', + * we need to give it back to the old parent. + */ + if (p->p_oppid && (t = pfind(p->p_oppid))) { + p->p_oppid = 0; + proc_reparent(p, t); + psignal(t, SIGCHLD); + wakeup((caddr_t)t); + return (0); + } + p->p_xstat = 0; + ruadd(&q->p_stats->p_cru, p->p_ru); + FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; + + /* + * Decrement the count of procs running with this uid. + */ + (void)chgproccnt(p->p_cred->p_ruid, -1); + + /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* + * Free up credentials. + */ + if (--p->p_cred->p_refcnt == 0) { + crfree(p->p_cred->pc_ucred); + FREE(p->p_cred, M_SUBPROC); + p->p_cred = NULL; + } + + /* + * Finally finished with old proc entry. + * Unlink it from its process group and free it. + */ + leavepgrp(p); + LIST_REMOVE(p, p_list); /* off zombproc */ + LIST_REMOVE(p, p_sibling); + +#ifdef COMPAT_LINUX_THREADS + if (--p->p_procsig->ps_refcnt == 0) { + if (p->p_sigacts != &p->p_addr->u_sigacts) + FREE(p->p_sigacts, M_SUBPROC); + FREE(p->p_procsig, M_SUBPROC); + p->p_procsig = NULL; + } +#endif /* COMPAT_LINUX_THREADS */ + /* + * Give machine-dependent layer a chance + * to free anything that cpu_exit couldn't + * release while still running in process context. 
+ */ + cpu_wait(p); + zfree(proc_zone, p); + nprocs--; + return (0); + } + if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { + p->p_flag |= P_WAITED; + q->p_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) { + q->p_retval[1] = W_STOPCODE(p->p_xstat); + error = 0; + } else +#endif + if (uap->status) { + status = W_STOPCODE(p->p_xstat); + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else + error = 0; + return (error); + } + } + if (nfound == 0) + return (ECHILD); + if (uap->options & WNOHANG) { + q->p_retval[0] = 0; + return (0); + } + if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) + return (error); + goto loop; +} + +/* + * make process 'parent' the new parent of process 'child'. + */ +void +proc_reparent(child, parent) + register struct proc *child; + register struct proc *parent; +{ + + if (child->p_pptr == parent) + return; + + LIST_REMOVE(child, p_sibling); + LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); + child->p_pptr = parent; +} + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_exit(function) + exitlist_fn function; +{ + ele_p ep; + + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = exit_list; + ep->function = function; + exit_list = ep; + return (0); +} +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Logically this can only be 0 or 1. + */ +int +rm_at_exit(function) + exitlist_fn function; +{ + ele_p *epp, ep; + int count; + + count = 0; + epp = &exit_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + +#ifdef COMPAT_LINUX_THREADS +void check_sigacts (void) +{ + struct proc *p = curproc; + struct sigacts *pss; + int s; + + if (p->p_procsig->ps_refcnt == 1 && + p->p_sigacts != &p->p_addr->u_sigacts) { + pss = p->p_sigacts; + s = splhigh(); + p->p_addr->u_sigacts = *pss; + p->p_sigacts = &p->p_addr->u_sigacts; + splx(s); + FREE(pss, M_SUBPROC); + } +} +#endif diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c new file mode 100644 index 0000000..732712b --- /dev/null +++ b/sys/kern/kern_fork.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $Id: kern_fork.c,v 1.53 1998/12/19 02:55:33 julian Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/vnode.h> +#include <sys/acct.h> +#include <sys/ktrace.h> +#include <sys/unistd.h> + +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> + +#ifdef COMPAT_LINUX_THREADS +#include <machine/frame.h> +#include <sys/user.h> +#endif /* COMPAT_LINUX_THREADS */ +#ifdef SMP +static int fast_vfork = 0; /* Doesn't work on SMP yet. */ +#else +static int fast_vfork = 1; +#endif +SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, ""); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +typedef struct fork_list_element { + struct fork_list_element *next; + forklist_fn function; +} *fle_p; + +static fle_p fork_list; + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +fork(p, uap) + struct proc *p; + struct fork_args *uap; +{ + + return (fork1(p, RFFDG | RFPROC)); +} + +/* ARGSUSED */ +int +vfork(p, uap) + struct proc *p; + struct vfork_args *uap; +{ + + return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? 
RFMEM : 0))); +} + +/* ARGSUSED */ +int +rfork(p, uap) + struct proc *p; + struct rfork_args *uap; +{ + + return (fork1(p, uap->flags)); +} + + +int nprocs = 1; /* process 0 */ +static int nextpid = 0; + +int +fork1(p1, flags) + register struct proc *p1; + int flags; +{ + register struct proc *p2, *pptr; + register uid_t uid; + struct proc *newproc; + int count; + static int pidchecked = 0; + fle_p ep ; + + ep = fork_list; + + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); + +#ifdef SMP + /* + * FATAL now, we cannot have the same PTD on both cpus, the PTD + * needs to move out of PTmap and be per-process, even for shared + * page table processes. Unfortunately, this means either removing + * PTD[] as a fixed virtual address, or move it to the per-cpu map + * area for SMP mode. Both cases require seperate management of + * the per-process-even-if-PTmap-is-shared PTD. + */ + if (flags & RFMEM) { + printf("shared address space fork attempted: pid: %d\n", + p1->p_pid); + return (EOPNOTSUPP); + } +#endif + + /* + * Here we don't create a new process, but we divorce + * certain parts of a process from itself. + */ + if ((flags & RFPROC) == 0) { + + /* + * Divorce the memory, if it is shared, essentially + * this changes shared memory amongst threads, into + * COW locally. + */ + if ((flags & RFMEM) == 0) { + if (p1->p_vmspace->vm_refcnt > 1) { + vmspace_unshare(p1); + } + } + + /* + * Close all file descriptors. + */ + if (flags & RFCFDG) { + struct filedesc *fdtmp; + fdtmp = fdinit(p1); + fdfree(p1); + p1->p_fd = fdtmp; + } + + /* + * Unshare file descriptors (from parent.) + */ + if (flags & RFFDG) { + if (p1->p_fd->fd_refcnt > 1) { + struct filedesc *newfd; + newfd = fdcopy(p1); + fdfree(p1); + p1->p_fd = newfd; + } + } + return (0); + } + + /* + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last process; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + uid = p1->p_cred->p_ruid; + if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { + tablefull("proc"); + return (EAGAIN); + } + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; + + /* + * Increment the count of procs running with this uid. Don't allow + * a nonprivileged user to exceed their current limit. + */ + count = chgproccnt(uid, 1); + if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) { + (void)chgproccnt(uid, -1); + /* + * Back out the process count + */ + nprocs--; + return (EAGAIN); + } + + /* Allocate new proc. */ + newproc = zalloc(proc_zone); + +/* + * Setup linkage for kernel based threading + */ + if((flags & RFTHREAD) != 0) { + newproc->p_peers = p1->p_peers; + p1->p_peers = newproc; + newproc->p_leader = p1->p_leader; + } else { + newproc->p_peers = 0; + newproc->p_leader = newproc; + } + + newproc->p_wakeup = 0; + + /* + * Find an unused process ID. We remember a range of unused IDs + * ready to use (from nextpid+1 through pidchecked-1). + */ + nextpid++; +retry: + /* + * If the process ID prototype has wrapped around, + * restart somewhat above 0, as the low-numbered procs + * tend to include daemons that don't exit. 
+ */ + if (nextpid >= PID_MAX) { + nextpid = 100; + pidchecked = 0; + } + if (nextpid >= pidchecked) { + int doingzomb = 0; + + pidchecked = PID_MAX; + /* + * Scan the active and zombie procs to check whether this pid + * is in use. Remember the lowest pid that's greater + * than nextpid, so we can avoid checking for a while. + */ + p2 = allproc.lh_first; +again: + for (; p2 != 0; p2 = p2->p_list.le_next) { + while (p2->p_pid == nextpid || + p2->p_pgrp->pg_id == nextpid || + p2->p_session->s_sid == nextpid) { + nextpid++; + if (nextpid >= pidchecked) + goto retry; + } + if (p2->p_pid > nextpid && pidchecked > p2->p_pid) + pidchecked = p2->p_pid; + if (p2->p_pgrp->pg_id > nextpid && + pidchecked > p2->p_pgrp->pg_id) + pidchecked = p2->p_pgrp->pg_id; + if (p2->p_session->s_sid > nextpid && + pidchecked > p2->p_session->s_sid) + pidchecked = p2->p_session->s_sid; + } + if (!doingzomb) { + doingzomb = 1; + p2 = zombproc.lh_first; + goto again; + } + } + + p2 = newproc; + p2->p_stat = SIDL; /* protect against others */ + p2->p_pid = nextpid; + LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + + /* + * Make a proc table entry for the new process. + * Start by zeroing the section of proc that is zero-initialized, + * then copy the section that is copied directly from the parent. + */ + bzero(&p2->p_startzero, + (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero)); + bcopy(&p1->p_startcopy, &p2->p_startcopy, + (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); + + p2->p_aioinfo = NULL; + + /* + * Duplicate sub-structures as needed. + * Increase reference counts on shared objects. + * The p_stats and p_sigacts substructs are set in vm_fork. + */ + p2->p_flag = P_INMEM; + if (p1->p_flag & P_PROFIL) + startprofclock(p2); + MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred), + M_SUBPROC, M_WAITOK); + bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred)); + p2->p_cred->p_refcnt = 1; + crhold(p1->p_ucred); + +#ifdef COMPAT_LINUX_THREADS + if (flags & RFSIGSHARE) { + p2->p_procsig = p1->p_procsig; + p2->p_procsig->ps_refcnt++; + if (p1->p_sigacts == &p1->p_addr->u_sigacts) { + struct sigacts *newsigacts; + int s; + + if (p2->p_procsig->ps_refcnt != 2) + printf ("PID:%d Creating shared sigacts with procsig->ps_refcnt %d\n", + p2->p_pid, p2->p_procsig->ps_refcnt); + /* Create the shared sigacts structure */ + MALLOC (newsigacts, struct sigacts *, sizeof (struct sigacts), + M_SUBPROC, M_WAITOK); + s = splhigh(); + /* Set p_sigacts to the new shared structure. Note that this + * is updating p1->p_sigacts at the same time, since p_sigacts + * is just a pointer to the shared p_procsig->ps_sigacts. 
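The search above advances nextpid, wraps it back to 100 once it reaches PID_MAX (so long-lived daemons in the low range are skipped), and uses pidchecked to remember how far the scan of allproc/zombproc remains valid. Reduced to a userland sketch, with a simple in-use table standing in for the proc, pgrp and session scans and with illustrative constants rather than the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    #define PID_MAX     1000    /* illustrative limit, not the kernel's */
    #define PID_RESTART  100    /* skip low pids: long-lived daemons live there */

    static bool in_use[PID_MAX];        /* stand-in for scanning allproc/zombproc */
    static int  nextpid;

    static int
    alloc_pid(void)
    {
        nextpid++;
        for (;;) {
            if (nextpid >= PID_MAX)
                nextpid = PID_RESTART;  /* wrapped: restart above the daemons */
            if (!in_use[nextpid]) {
                in_use[nextpid] = true;
                return (nextpid);
            }
            nextpid++;                  /* candidate taken, keep walking */
        }
    }

    int
    main(void)
    {
        in_use[1] = true;               /* init */
        printf("first pid: %d\n", alloc_pid());
        nextpid = PID_MAX - 1;          /* force a wrap */
        printf("after wrap: %d\n", alloc_pid());
        return (0);
    }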
+ */ + p2->p_sigacts = newsigacts; + /* Copy in the values from the u area */ + *p2->p_sigacts = p1->p_addr->u_sigacts; + splx (s); + } + } else { + MALLOC (p2->p_procsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + bcopy(&p1->p_procsig->ps_begincopy, &p2->p_procsig->ps_begincopy, + (unsigned)&p1->p_procsig->ps_endcopy - + (unsigned)&p1->p_procsig->ps_begincopy); + p2->p_procsig->ps_refcnt = 1; + /* Note that we fill in the values of sigacts in vm_fork */ + p2->p_sigacts = NULL; + } + if (flags & RFLINUXTHPN) { + p2->p_sigparent = SIGUSR1; + } +#endif /* COMPAT_LINUX_THREADS */ + /* bump references to the text vnode (for procfs) */ + p2->p_textvp = p1->p_textvp; + if (p2->p_textvp) + VREF(p2->p_textvp); + + if (flags & RFCFDG) + p2->p_fd = fdinit(p1); + else if (flags & RFFDG) + p2->p_fd = fdcopy(p1); + else + p2->p_fd = fdshare(p1); + + /* + * If p_limit is still copy-on-write, bump refcnt, + * otherwise get a copy that won't be modified. + * (If PL_SHAREMOD is clear, the structure is shared + * copy-on-write.) + */ + if (p1->p_limit->p_lflags & PL_SHAREMOD) + p2->p_limit = limcopy(p1->p_limit); + else { + p2->p_limit = p1->p_limit; + p2->p_limit->p_refcnt++; + } + + /* + * Preserve some more flags in subprocess. P_PROFIL has already + * been preserved. + */ + p2->p_flag |= p1->p_flag & P_SUGID; + if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) + p2->p_flag |= P_CONTROLT; + if (flags & RFPPWAIT) + p2->p_flag |= P_PPWAIT; + + LIST_INSERT_AFTER(p1, p2, p_pglist); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); + LIST_INIT(&p2->p_children); + +#ifdef KTRACE + /* + * Copy traceflag and tracefile if enabled. + * If not inherited, these were zeroed above. + */ + if (p1->p_traceflag&KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracep = p1->p_tracep) != NULL) + VREF(p2->p_tracep); + } +#endif + + /* + * set priority of child to be that of parent + */ + p2->p_estcpu = p1->p_estcpu; + + /* + * This begins the section where we must prevent the parent + * from being swapped. + */ + p1->p_flag |= P_NOSWAP; + + /* + * Finish creating the child process. It will return via a different + * execution path later. (ie: directly into user mode) + */ + vm_fork(p1, p2, flags); + + /* + * Both processes are set up, now check if any LKMs want + * to adjust anything. + * What if they have an error? XXX + */ + while (ep) { + (*ep->function)(p1, p2, flags); + ep = ep->next; + } + + /* + * Make child runnable and add to run queue. + */ + microtime(&(p2->p_stats->p_start)); + p2->p_acflag = AFORK; + (void) splhigh(); + p2->p_stat = SRUN; + setrunqueue(p2); + (void) spl0(); + + /* + * Now can be swapped. + */ + p1->p_flag &= ~P_NOSWAP; + + /* + * Preserve synchronization semantics of vfork. If waiting for + * child to exec or exit, set P_PPWAIT on child, and sleep on our + * proc (in case of exit). + */ + while (p2->p_flag & P_PPWAIT) + tsleep(p1, PWAIT, "ppwait", 0); + + /* + * Return child pid to parent process, + * marking us as parent via p1->p_retval[1]. + */ + p1->p_retval[0] = p2->p_pid; + p1->p_retval[1] = 0; + return (0); +} + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. 
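The exit-list handling earlier (at_exit()/rm_at_exit()) and the fork-list handling that follows share one idiom: a singly linked list of function pointers whose removal walks a pointer to the link field, so unlinking the head and unlinking an interior entry are the same operation. A self-contained sketch of that pattern with a hypothetical callout type and plain malloc/free:

    #include <stdlib.h>

    typedef void (*callout_fn)(void);

    struct callout {
        struct callout *next;
        callout_fn      function;
    };

    static struct callout *callout_list;

    /* Prepend a handler, mirroring at_fork()/at_exit(). Returns 0 on success. */
    int
    add_callout(callout_fn function)
    {
        struct callout *ep;

        ep = malloc(sizeof(*ep));
        if (ep == NULL)
            return (-1);
        ep->function = function;
        ep->next = callout_list;
        callout_list = ep;
        return (0);
    }

    /*
     * Remove every entry for `function'. The pointer-to-pointer walk treats
     * the list head and interior nodes identically. Returns entries removed.
     */
    int
    remove_callout(callout_fn function)
    {
        struct callout **epp = &callout_list, *ep;
        int count = 0;

        while ((ep = *epp) != NULL) {
            if (ep->function == function) {
                *epp = ep->next;    /* unlink without touching a predecessor node */
                free(ep);
                count++;
            } else
                epp = &ep->next;
        }
        return (count);
    }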
+ * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ +int +at_fork(function) + forklist_fn function; +{ + fle_p ep; + + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("fork callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = fork_list; + ep->function = function; + fork_list = ep; + return (0); +} + +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Theoretically this value can only be 0 or 1. + */ +int +rm_at_fork(function) + forklist_fn function; +{ + fle_p *epp, ep; + int count; + + count= 0; + epp = &fork_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c new file mode 100644 index 0000000..1d6756c --- /dev/null +++ b/sys/kern/kern_intr.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 1997, Stefan Esser <se@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $Id: kern_intr.c,v 1.20 1998/09/26 14:25:31 dfr Exp $ + * + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/errno.h> +#ifdef RESOURCE_CHECK +#include <sys/drvresource.h> +#endif /* RESOURCE_CHECK */ + +#include <machine/ipl.h> + +#ifdef __i386__ +#include <i386/isa/icu.h> +#include <i386/isa/intr_machdep.h> +#endif + +#include <sys/interrupt.h> + +#include <stddef.h> + +#ifdef __i386__ + +typedef struct intrec { + intrmask_t mask; + inthand2_t *handler; + void *argument; + struct intrec *next; + void *devdata; + int intr; + intrmask_t *maskptr; + int flags; +} intrec; + +static intrec *intreclist_head[NHWI]; + +#endif + +struct swilist { + swihand_t *sl_handler; + struct swilist *sl_next; +}; + +static struct swilist swilists[NSWI]; + +#ifdef __i386__ + +/* + * The interrupt multiplexer calls each of the handlers in turn, + * and applies the associated interrupt mask to "cpl", which is + * defined as a ".long" in /sys/i386/isa/ipl.s + */ + +#ifndef SMP +static __inline intrmask_t +splq(intrmask_t mask) +{ + intrmask_t tmp = cpl; + cpl |= mask; + return (tmp); +} +#endif /* SMP */ + +static void +intr_mux(void *arg) +{ + intrec *p = arg; + + while (p != NULL) { + int oldspl = splq(p->mask); + p->handler(p->argument); + splx(oldspl); + p = p->next; + } +} + +static intrec* +find_idesc(unsigned *maskptr, int irq) +{ + intrec *p = intreclist_head[irq]; + + while (p && p->maskptr != maskptr) + p = p->next; + + return (p); +} + +static intrec** +find_pred(intrec *idesc, int irq) +{ + intrec **pp = &intreclist_head[irq]; + intrec *p = *pp; + + while (p != idesc) { + if (p == NULL) + return (NULL); + pp = &p->next; + p = *pp; + } + return (pp); +} + +/* + * Both the low level handler and the shared interrupt multiplexer + * block out further interrupts as set in the handlers "mask", while + * the handler is running. In fact *maskptr should be used for this + * purpose, but since this requires one more pointer dereference on + * each interrupt, we rather bother update "mask" whenever *maskptr + * changes. The function "update_masks" should be called **after** + * all manipulation of the linked list of interrupt handlers hung + * off of intrdec_head[irq] is complete, since the chain of handlers + * will both determine the *maskptr values and the instances of mask + * that are fixed. This function should be called with the irq for + * which a new handler has been add blocked, since the masks may not + * yet know about the use of this irq for a device of a certain class. 
+ */ + +static void +update_mux_masks(void) +{ + int irq; + for (irq = 0; irq < ICU_LEN; irq++) { + intrec *idesc = intreclist_head[irq]; + while (idesc != NULL) { + if (idesc->maskptr != NULL) { + /* our copy of *maskptr may be stale, refresh */ + idesc->mask = *idesc->maskptr; + } + idesc = idesc->next; + } + } +} + +static void +update_masks(intrmask_t *maskptr, int irq) +{ + intrmask_t mask = 1 << irq; + + if (maskptr == NULL) + return; + + if (find_idesc(maskptr, irq) == NULL) { + /* no reference to this maskptr was found in this irq's chain */ + if ((*maskptr & mask) == 0) + return; + /* the irq was included in the classes mask, remove it */ + INTRUNMASK(*maskptr, mask); + } else { + /* a reference to this maskptr was found in this irq's chain */ + if ((*maskptr & mask) != 0) + return; + /* put the irq into the classes mask */ + INTRMASK(*maskptr, mask); + } + /* we need to update all values in the intr_mask[irq] array */ + update_intr_masks(); + /* update mask in chains of the interrupt multiplex handler as well */ + update_mux_masks(); +} + +/* + * Add interrupt handler to linked list hung off of intreclist_head[irq] + * and install shared interrupt multiplex handler, if necessary + */ + +static int +add_intrdesc(intrec *idesc) +{ + int irq = idesc->intr; + + intrec *head = intreclist_head[irq]; + + if (head == NULL) { + /* first handler for this irq, just install it */ + if (icu_setup(irq, idesc->handler, idesc->argument, + idesc->maskptr, idesc->flags) != 0) + return (-1); + + update_intrname(irq, (intptr_t)idesc->devdata); + /* keep reference */ + intreclist_head[irq] = idesc; + } else { + if ((idesc->flags & INTR_EXCL) != 0 + || (head->flags & INTR_EXCL) != 0) { + /* + * can't append new handler, if either list head or + * new handler do not allow interrupts to be shared + */ + if (bootverbose) + printf("\tdevice combination doesn't support " + "shared irq%d\n", irq); + return (-1); + } + if (head->next == NULL) { + /* + * second handler for this irq, replace device driver's + * handler by shared interrupt multiplexer function + */ + icu_unset(irq, head->handler); + if (icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0) != 0) + return (-1); + if (bootverbose) + printf("\tusing shared irq%d.\n", irq); + update_intrname(irq, -1); + } + /* just append to the end of the chain */ + while (head->next != NULL) + head = head->next; + head->next = idesc; + } + update_masks(idesc->maskptr, irq); + return (0); +} + +/* + * Add the interrupt handler descriptor data structure created by an + * earlier call of create_intr() to the linked list for its irq and + * adjust the interrupt masks if necessary. + * + * This function effectively activates the handler. + */ + +int +intr_connect(intrec *idesc) +{ + int errcode = -1; + int irq; + +#ifdef RESOURCE_CHECK + int resflag; +#endif /* RESOURCE_CHECK */ + + if (idesc == NULL) + return (-1); + + irq = idesc->intr; +#ifdef RESOURCE_CHECK + resflag = (idesc->flags & INTR_EXCL) ? 
RESF_NONE : RESF_SHARED; + if (resource_claim(idesc->devdata, REST_INT, resflag, irq, irq) == 0) +#endif /* RESOURCE_CHECK */ + { + /* block this irq */ + intrmask_t oldspl = splq(1 << irq); + + /* add irq to class selected by maskptr */ + errcode = add_intrdesc(idesc); + splx(oldspl); + } + if (errcode != 0 && bootverbose) + printf("\tintr_connect(irq%d) failed, result=%d\n", + irq, errcode); + + return (errcode); +} + +/* + * Remove the interrupt handler descriptor data connected created by an + * earlier call of intr_connect() from the linked list and adjust the + * interrupt masks if necessary. + * + * This function deactivates the handler. + */ + +int +intr_disconnect(intrec *idesc) +{ + intrec **hook, *head; + int irq; + int errcode = 0; + + if (idesc == NULL) + return (-1); + + irq = idesc->intr; + + /* find pointer that keeps the reference to this interrupt descriptor */ + hook = find_pred(idesc, irq); + if (hook == NULL) + return (-1); + + /* make copy of original list head, the line after may overwrite it */ + head = intreclist_head[irq]; + + /* unlink: make predecessor point to idesc->next instead of to idesc */ + *hook = idesc->next; + + /* now check whether the element we removed was the list head */ + if (idesc == head) { + intrmask_t oldspl = splq(1 << irq); + + /* we want to remove the list head, which was known to intr_mux */ + icu_unset(irq, (inthand2_t*)intr_mux); + + /* check whether the new list head is the only element on list */ + head = intreclist_head[irq]; + if (head != NULL) { + if (head->next != NULL) { + /* install the multiplex handler with new list head as argument */ + errcode = icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0); + if (errcode == 0) + update_intrname(irq, -1); + } else { + /* install the one remaining handler for this irq */ + errcode = icu_setup(irq, head->handler, + head->argument, + head->maskptr, head->flags); + if (errcode == 0) + update_intrname(irq, (intptr_t)head->devdata); + } + } + splx(oldspl); + } + update_masks(idesc->maskptr, irq); +#ifdef RESOURCE_CHECK + resource_free(idesc->devdata); +#endif /* RESOURCE_CHECK */ + return (0); +} + +/* + * Create an interrupt handler descriptor data structure, which later can + * be activated or deactivated at will by calls of [dis]connect(intrec*). + * + * The dev_instance pointer is required for resource management, and will + * only be passed through to resource_claim(). + * + * The interrupt handler takes an argument of type (void*), which is not + * what is currently used for ISA devices. But since the unit number passed + * to an ISA interrupt handler can be stored in a (void*) variable, this + * causes no problems. Eventually all the ISA interrupt handlers should be + * modified to accept the pointer to their private data, too, instead of + * an integer index. + * + * There will be functions that derive a driver and unit name from a + * dev_instance variable, and those functions will be used to maintain the + * interrupt counter label array referenced by systat and vmstat to report + * device interrupt rates (->update_intrlabels). 
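intr_mux() above lets several drivers share one interrupt line by walking a per-irq chain of intrec records and calling each handler in turn, and add_intrdesc() appends new records to the end of that chain. A stripped-down sketch of the chain and its dispatcher, leaving out the spl masking and ICU setup, with hypothetical names throughout:

    #include <stdio.h>
    #include <stdlib.h>

    typedef void (*handler_fn)(void *arg);

    /* One record per attached driver, chained per interrupt line (cf. struct intrec). */
    struct handler_rec {
        handler_fn          handler;
        void               *argument;
        struct handler_rec *next;
    };

    #define NLINES 16
    static struct handler_rec *line_head[NLINES];

    /* Append a handler to a line's chain, as add_intrdesc() does for shared irqs. */
    static int
    attach_handler(int line, handler_fn fn, void *arg)
    {
        struct handler_rec *rec, **tail;

        rec = malloc(sizeof(*rec));
        if (rec == NULL)
            return (-1);
        rec->handler = fn;
        rec->argument = arg;
        rec->next = NULL;
        for (tail = &line_head[line]; *tail != NULL; tail = &(*tail)->next)
            ;                               /* walk to the end of the chain */
        *tail = rec;
        return (0);
    }

    /* The multiplexer: call every handler on the chain in turn (cf. intr_mux()). */
    static void
    dispatch_line(int line)
    {
        struct handler_rec *rec;

        for (rec = line_head[line]; rec != NULL; rec = rec->next)
            rec->handler(rec->argument);
    }

    static void say(void *arg) { printf("%s handled the interrupt\n", (char *)arg); }

    int
    main(void)
    {
        attach_handler(5, say, "driver A");
        attach_handler(5, say, "driver B");
        dispatch_line(5);                   /* both drivers run, in attach order */
        return (0);
    }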
+ */ + +intrec * +intr_create(void *dev_instance, int irq, inthand2_t handler, void *arg, + intrmask_t *maskptr, int flags) +{ + intrec *idesc; + + if (ICU_LEN > 8 * sizeof *maskptr) { + printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", + ICU_LEN, 8 * sizeof *maskptr); + return (NULL); + } + if ((unsigned)irq >= ICU_LEN) { + printf("create_intr: requested irq%d too high, limit is %d\n", + irq, ICU_LEN -1); + return (NULL); + } + + idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + if (idesc) { + idesc->next = NULL; + bzero(idesc, sizeof *idesc); + + idesc->devdata = dev_instance; + idesc->handler = handler; + idesc->argument = arg; + idesc->maskptr = maskptr; + idesc->intr = irq; + idesc->flags = flags; + } + return (idesc); +} + +/* + * Return the memory held by the interrupt handler descriptor data structure + * to the system. Make sure, the handler is not actively used anymore, before. + */ + +int +intr_destroy(intrec *rec) +{ + if (intr_disconnect(rec) != 0) + return (-1); + free(rec, M_DEVBUF); + return (0); +} + +/* + * Emulate the register_intr() call previously defined as low level function. + * That function (now icu_setup()) may no longer be directly called, since + * a conflict between an ISA and PCI interrupt might go by unnocticed, else. + */ + +int +register_intr(int intr, int device_id, u_int flags, + inthand2_t handler, u_int *maskptr, int unit) +{ + /* XXX modify to include isa_device instead of device_id */ + intrec *idesc; + + flags |= INTR_EXCL; + idesc = intr_create((void *)(intptr_t)device_id, intr, handler, + (void*)(intptr_t)unit, maskptr, flags); + return (intr_connect(idesc)); +} + +/* + * Emulate the old unregister_intr() low level function. + * Make sure there is just one interrupt, that it was + * registered as non-shared, and that the handlers match. 
+ */ + +int +unregister_intr(int intr, inthand2_t handler) +{ + intrec *p = intreclist_head[intr]; + + if (p != NULL && (p->flags & INTR_EXCL) != 0 && p->handler == handler) + return (intr_destroy(p)); + return (EINVAL); +} + +#endif /* __i386__ */ + +void +register_swi(intr, handler) + int intr; + swihand_t *handler; +{ + struct swilist *slp, *slq; + int s; + + if (intr < NHWI || intr >= NHWI + NSWI) + panic("register_swi: bad intr %d", intr); + if (handler == swi_generic || handler == swi_null) + panic("register_swi: bad handler %p", (void *)handler); + slp = &swilists[intr - NHWI]; + s = splhigh(); + if (ihandlers[intr] == swi_null) + ihandlers[intr] = handler; + else { + if (slp->sl_next == NULL) { + slp->sl_handler = ihandlers[intr]; + ihandlers[intr] = swi_generic; + } + slq = malloc(sizeof(*slq), M_DEVBUF, M_NOWAIT); + if (slq == NULL) + panic("register_swi: malloc failed"); + slq->sl_handler = handler; + slq->sl_next = NULL; + while (slp->sl_next != NULL) + slp = slp->sl_next; + slp->sl_next = slq; + } + splx(s); +} + +void +swi_dispatcher(intr) + int intr; +{ + struct swilist *slp; + + slp = &swilists[intr - NHWI]; + do { + (*slp->sl_handler)(); + slp = slp->sl_next; + } while (slp != NULL); +} + +void +unregister_swi(intr, handler) + int intr; + swihand_t *handler; +{ + struct swilist *slfoundpred, *slp, *slq; + int s; + + if (intr < NHWI || intr >= NHWI + NSWI) + panic("unregister_swi: bad intr %d", intr); + if (handler == swi_generic || handler == swi_null) + panic("unregister_swi: bad handler %p", (void *)handler); + slp = &swilists[intr - NHWI]; + s = splhigh(); + if (ihandlers[intr] == handler) + ihandlers[intr] = swi_null; + else if (slp->sl_next != NULL) { + slfoundpred = NULL; + for (slq = slp->sl_next; slq != NULL; + slp = slq, slq = slp->sl_next) + if (slq->sl_handler == handler) + slfoundpred = slp; + slp = &swilists[intr - NHWI]; + if (slfoundpred != NULL) { + slq = slfoundpred->sl_next; + slfoundpred->sl_next = slq->sl_next; + free(slq, M_DEVBUF); + } else if (slp->sl_handler == handler) { + slq = slp->sl_next; + slp->sl_next = slq->sl_next; + slp->sl_handler = slq->sl_handler; + free(slq, M_DEVBUF); + } + if (slp->sl_next == NULL) + ihandlers[intr] = slp->sl_handler; + } + splx(s); +} + diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c new file mode 100644 index 0000000..7a6d237 --- /dev/null +++ b/sys/kern/kern_ktrace.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $Id: kern_ktrace.c,v 1.24 1998/11/10 09:16:29 peter Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/lock.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/ktrace.h> +#include <sys/malloc.h> +#include <sys/syslog.h> + +static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); + +#ifdef KTRACE +static struct ktr_header *ktrgetheader __P((int type)); +static void ktrwrite __P((struct vnode *, struct ktr_header *)); +static int ktrcanset __P((struct proc *,struct proc *)); +static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *)); +static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *)); + + +static struct ktr_header * +ktrgetheader(type) + int type; +{ + register struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_KTRACE, M_WAITOK); + kth->ktr_type = type; + microtime(&kth->ktr_time); + kth->ktr_pid = p->p_pid; + bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + return (kth); +} + +void +ktrsyscall(vp, code, narg, args) + struct vnode *vp; + int code, narg, args[]; +{ + struct ktr_header *kth; + struct ktr_syscall *ktp; + register int len = sizeof(struct ktr_syscall) + (narg * sizeof(int)); + struct proc *p = curproc; /* XXX */ + int *argp, i; + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSCALL); + MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK); + ktp->ktr_code = code; + ktp->ktr_narg = narg; + argp = (int *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < narg; i++) + *argp++ = args[i]; + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = len; + ktrwrite(vp, kth); + FREE(ktp, M_KTRACE); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrsysret(vp, code, error, retval) + struct vnode *vp; + int code, error, retval; +{ + struct ktr_header *kth; + struct ktr_sysret ktp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSRET); + ktp.ktr_code = code; + ktp.ktr_error = error; + ktp.ktr_retval = retval; /* what about val2 ? 
*/ + + kth->ktr_buf = (caddr_t)&ktp; + kth->ktr_len = sizeof(struct ktr_sysret); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrnamei(vp, path) + struct vnode *vp; + char *path; +{ + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_NAMEI); + kth->ktr_len = strlen(path); + kth->ktr_buf = path; + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrgenio(vp, fd, rw, iov, len, error) + struct vnode *vp; + int fd; + enum uio_rw rw; + register struct iovec *iov; + int len, error; +{ + struct ktr_header *kth; + register struct ktr_genio *ktp; + register caddr_t cp; + register int resid = len, cnt; + struct proc *p = curproc; /* XXX */ + + if (error) + return; + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_GENIO); + MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, + M_KTRACE, M_WAITOK); + ktp->ktr_fd = fd; + ktp->ktr_rw = rw; + cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); + while (resid > 0) { + if ((cnt = iov->iov_len) > resid) + cnt = resid; + if (copyin(iov->iov_base, cp, (unsigned)cnt)) + goto done; + cp += cnt; + resid -= cnt; + iov++; + } + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = sizeof (struct ktr_genio) + len; + + ktrwrite(vp, kth); +done: + FREE(kth, M_KTRACE); + FREE(ktp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrpsig(vp, sig, action, mask, code) + struct vnode *vp; + int sig; + sig_t action; + int mask, code; +{ + struct ktr_header *kth; + struct ktr_psig kp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_PSIG); + kp.signo = (char)sig; + kp.action = action; + kp.mask = mask; + kp.code = code; + kth->ktr_buf = (caddr_t)&kp; + kth->ktr_len = sizeof (struct ktr_psig); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrcsw(vp, out, user) + struct vnode *vp; + int out, user; +{ + struct ktr_header *kth; + struct ktr_csw kc; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_CSW); + kc.out = out; + kc.user = user; + kth->ktr_buf = (caddr_t)&kc; + kth->ktr_len = sizeof (struct ktr_csw); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} +#endif + +/* Interface and common routines */ + +/* + * ktrace system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif +/* ARGSUSED */ +int +ktrace(curp, uap) + struct proc *curp; + register struct ktrace_args *uap; +{ +#ifdef KTRACE + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; + int ret = 0; + int error = 0; + struct nameidata nd; + + curp->p_traceflag |= KTRFAC_ACTIVE; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. 
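Each of the ktr*() routines above produces a record that is a fixed ktr_header followed by an optional variable-length payload; ktrwrite(), later in this file, gathers the two pieces into a single append with a two-element iovec. A userland sketch of the same gather-write using writev(2) and a hypothetical header layout:

    #include <sys/uio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical trace record header, loosely modelled on struct ktr_header. */
    struct trace_header {
        int type;
        int len;        /* length of the payload that follows */
        int pid;
    };

    /* Write header and payload as one gather-write, like ktrwrite()'s aiov[0]/aiov[1]. */
    static int
    trace_write(int fd, int type, int pid, const void *buf, int len)
    {
        struct trace_header th;
        struct iovec iov[2];
        int iovcnt = 1;

        memset(&th, 0, sizeof(th));
        th.type = type;
        th.len = len;
        th.pid = pid;

        iov[0].iov_base = &th;
        iov[0].iov_len = sizeof(th);
        if (len > 0) {
            iov[1].iov_base = (void *)buf;
            iov[1].iov_len = len;
            iovcnt = 2;
        }
        return (writev(fd, iov, iovcnt) < 0 ? -1 : 0);
    }

    int
    main(void)
    {
        const char msg[] = "hello";

        return (trace_write(STDOUT_FILENO, 1, 42, msg, sizeof(msg) - 1));
    }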
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp); + error = vn_open(&nd, FREAD|FWRITE, 0); + if (error) { + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); + } + vp = nd.ni_vp; + VOP_UNLOCK(vp, 0, curp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile + */ + if (ops == KTROP_CLEARFILE) { + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + if (ktrcanset(curp, p)) { + p->p_tracep = NULL; + p->p_traceflag = 0; + (void) vn_close(vp, FREAD|FWRITE, + p->p_ucred, p); + } else + error = EPERM; + } + } + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) + */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (uap->pid < 0) { + /* + * by process group + */ + pg = pgfind(-uap->pid); + if (pg == NULL) { + error = ESRCH; + goto done; + } + for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + + } else { + /* + * by pid + */ + p = pfind(uap->pid); + if (p == NULL) { + error = ESRCH; + goto done; + } + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); +#else + return ENOSYS; +#endif +} + +/* + * utrace system call + */ +/* ARGSUSED */ +int +utrace(curp, uap) + struct proc *curp; + register struct utrace_args *uap; +{ +#ifdef KTRACE + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + register caddr_t cp; + + if (!KTRPOINT(p, KTR_USER)) + return (0); + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_USER); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + kth->ktr_buf = cp; + kth->ktr_len = uap->len; + ktrwrite(p->p_tracep, kth); + } + FREE(kth, M_KTRACE); + FREE(cp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; + + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int +ktrops(curp, p, ops, facs, vp) + struct proc *p, *curp; + int ops, facs; + struct vnode *vp; +{ + + if (!ktrcanset(curp, p)) + return (0); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish + */ + if (p->p_tracep != NULL) + vrele(p->p_tracep); + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (curp->p_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + if (p->p_tracep != NULL) { + vrele(p->p_tracep); + p->p_tracep = NULL; + } + } + } + + return (1); +} + +static int +ktrsetchildren(curp, top, ops, facs, vp) + struct proc *curp, *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + for (;;) { + ret |= ktrops(curp, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). 
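The comment above describes a recursion-free, pre-order walk of the process tree: follow the first child when there is one, otherwise the next sibling, otherwise climb back toward top until an unvisited sibling appears. The same traversal, as a self-contained sketch over a hypothetical node type with parent, first-child and next-sibling pointers:

    #include <stdio.h>

    struct node {
        const char  *name;
        struct node *parent;
        struct node *first_child;
        struct node *next_sibling;
    };

    /*
     * Visit every node under (and including) `top' exactly once, the way
     * ktrsetchildren() walks a process and all of its descendants.
     */
    static void
    walk(struct node *top, void (*visit)(struct node *))
    {
        struct node *p = top;

        for (;;) {
            visit(p);
            if (p->first_child != NULL) {       /* descend first */
                p = p->first_child;
                continue;
            }
            for (;;) {                          /* else sibling, else climb */
                if (p == top)
                    return;
                if (p->next_sibling != NULL) {
                    p = p->next_sibling;
                    break;
                }
                p = p->parent;
            }
        }
    }

    static void print_name(struct node *n) { printf("%s\n", n->name); }

    int
    main(void)
    {
        struct node root = { "top" }, a = { "a" }, b = { "b" }, a1 = { "a1" };

        root.first_child = &a;
        a.parent = &root; a.next_sibling = &b; a.first_child = &a1;
        b.parent = &root;
        a1.parent = &a;

        walk(&root, print_name);    /* prints: top, a, a1, b */
        return (0);
    }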
+ */ + if (p->p_children.lh_first) + p = p->p_children.lh_first; + else for (;;) { + if (p == top) + return (ret); + if (p->p_sibling.le_next) { + p = p->p_sibling.le_next; + break; + } + p = p->p_pptr; + } + } + /*NOTREACHED*/ +} + +static void +ktrwrite(vp, kth) + struct vnode *vp; + register struct ktr_header *kth; +{ + struct uio auio; + struct iovec aiov[2]; + register struct proc *p = curproc; /* XXX */ + int error; + + if (vp == NULL) + return; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_procp = curproc; + if (kth->ktr_len > 0) { + auio.uio_iovcnt++; + aiov[1].iov_base = kth->ktr_buf; + aiov[1].iov_len = kth->ktr_len; + auio.uio_resid += kth->ktr_len; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_WRITE(vp, &auio, IO_UNIT|IO_APPEND, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + p->p_tracep = NULL; + p->p_traceflag = 0; + vrele(vp); + } + } +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + * + * TODO: check groups. use caller effective gid. + */ +static int +ktrcanset(callp, targetp) + struct proc *callp, *targetp; +{ + register struct pcred *caller = callp->p_cred; + register struct pcred *target = targetp->p_cred; + + if ((caller->pc_ucred->cr_uid == target->p_ruid && + target->p_ruid == target->p_svuid && + caller->p_rgid == target->p_rgid && /* XXX */ + target->p_rgid == target->p_svgid && + (targetp->p_traceflag & KTRFAC_ROOT) == 0) || + caller->pc_ucred->cr_uid == 0) + return (1); + + return (0); +} + +#endif /* KTRACE */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c new file mode 100644 index 0000000..97def9f --- /dev/null +++ b/sys/kern/kern_linker.c @@ -0,0 +1,1016 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_linker.c,v 1.20 1999/01/19 16:26:32 peter Exp $ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <machine/cpu.h> +#include <machine/bootinfo.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/unistd.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/sysctl.h> + +#ifdef KLD_DEBUG +int kld_debug = 0; +#endif + +MALLOC_DEFINE(M_LINKER, "kld", "kernel linker"); +linker_file_t linker_current_file; +linker_file_t linker_kernel_file; + +static struct lock lock; /* lock for the file list */ +static linker_class_list_t classes; +static linker_file_list_t files; +static int next_file_id = 1; + +static void +linker_init(void* arg) +{ + lockinit(&lock, PVM, "klink", 0, 0); + TAILQ_INIT(&classes); + TAILQ_INIT(&files); +} + +SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); + +int +linker_add_class(const char* desc, void* priv, + struct linker_class_ops* ops) +{ + linker_class_t lc; + + lc = malloc(sizeof(struct linker_class), M_LINKER, M_NOWAIT); + if (!lc) + return ENOMEM; + bzero(lc, sizeof(*lc)); + + lc->desc = desc; + lc->priv = priv; + lc->ops = ops; + TAILQ_INSERT_HEAD(&classes, lc, link); + + return 0; +} + +static void +linker_file_sysinit(linker_file_t lf) +{ + struct linker_set* sysinits; + struct sysinit** sipp; + struct sysinit** xipp; + struct sysinit* save; + moduledata_t *moddata; + + KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", + lf->filename)); + + sysinits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysinit_set", 0); + + KLD_DPF(FILE, ("linker_file_sysinit: SYSINITs %p\n", sysinits)); + if (!sysinits) + return; + + /* HACK ALERT! */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->func == module_register_init) { + moddata = (*sipp)->udata; + moddata->_file = lf; + } + } + + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. + */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem <= (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. 
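linker_file_sysinit() below orders a NULL-terminated array of sysinit pointers by subsystem first and by order within a subsystem, using repeated compare-and-swap passes. A userland sketch of that two-key, swap-in-place sort over a hypothetical task record, written with a strict primary-key comparison:

    #include <stdio.h>

    /* Hypothetical task carrying the same two sort keys as struct sysinit. */
    struct task {
        int         subsystem;  /* primary key */
        int         order;      /* secondary key */
        const char *name;
    };

    /* Sort a NULL-terminated array of pointers in place by (subsystem, order). */
    static void
    sort_tasks(struct task **items)
    {
        struct task **sipp, **xipp, *save;

        for (sipp = items; *sipp; sipp++) {
            for (xipp = sipp + 1; *xipp; xipp++) {
                if ((*sipp)->subsystem < (*xipp)->subsystem ||
                    ((*sipp)->subsystem == (*xipp)->subsystem &&
                     (*sipp)->order <= (*xipp)->order))
                    continue;       /* already in order, skip */
                save = *sipp;
                *sipp = *xipp;
                *xipp = save;
            }
        }
    }

    int
    main(void)
    {
        struct task a = { 2, 0, "third" }, b = { 1, 5, "second" },
            c = { 1, 1, "first" };
        struct task *items[] = { &a, &b, &c, NULL };
        struct task **t;

        sort_tasks(items);
        for (t = items; *t; t++)
            printf("%d.%d %s\n", (*t)->subsystem, (*t)->order, (*t)->name);
        return (0);
    }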
+ */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch ((*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: +#if !defined(SMP) + /* kernel thread*/ + if (fork1(&proc0, RFFDG|RFPROC|RFMEM)) + panic("fork kernel thread"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; +#endif + + case SI_TYPE_KPROCESS: + /* kernel thread*/ + if (fork1(&proc0, RFFDG|RFPROC)) + panic("fork kernel process"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; + + default: + panic ("linker_file_sysinit: unrecognized init type"); + } + } +} + +static void +linker_file_sysuninit(linker_file_t lf) +{ + struct linker_set* sysuninits; + struct sysinit** sipp; + struct sysinit** xipp; + struct sysinit* save; + + KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", + lf->filename)); + + sysuninits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysuninit_set", 0); + + KLD_DPF(FILE, ("linker_file_sysuninit: SYSUNINITs %p\n", sysuninits)); + if (!sysuninits) + return; + + /* + * Perform a reverse bubble sort of the system initialization objects + * by their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. + */ + for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem >= (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order >= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + */ + for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch ((*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + default: + panic("linker_file_sysuninit: unrecognized uninit type"); + } + } +} + +int +linker_load_file(const char* filename, linker_file_t* result) +{ + linker_class_t lc; + linker_file_t lf; + int foundfile, error = 0; + char *koname = NULL; + + lf = linker_find_file_by_name(filename); + if (lf) { + KLD_DPF(FILE, ("linker_load_file: file %s is already loaded, incrementing refs\n", filename)); + *result = lf; + lf->refs++; + goto out; + } + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) { + error = ENOMEM; + goto out; + } + sprintf(koname, "%s.ko", filename); + lf = NULL; + foundfile = 0; + for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) { + KLD_DPF(FILE, ("linker_load_file: trying to load %s as %s\n", + filename, lc->desc)); + + error = lc->ops->load_file(koname, &lf); /* First with .ko */ + if (lf == NULL && error == ENOENT) + error = lc->ops->load_file(filename, &lf); /* Then try without */ + /* + * If we got something other than ENOENT, then it exists but we cannot + * load it for some other reason. 
+ */ + if (error != ENOENT) + foundfile = 1; + if (lf) { + linker_file_sysinit(lf); + + *result = lf; + error = 0; + goto out; + } + } + /* + * Less than ideal, but tells the user whether it failed to load or + * the module was not found. + */ + if (foundfile) + error = ENOEXEC; /* Format not recognised (or unloadable) */ + else + error = ENOENT; /* Nothing found */ + +out: + if (koname) + free(koname, M_LINKER); + return error; +} + +linker_file_t +linker_find_file_by_name(const char* filename) +{ + linker_file_t lf = 0; + char *koname; + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) + goto out; + sprintf(koname, "%s.ko", filename); + + lockmgr(&lock, LK_SHARED, 0, curproc); + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (!strcmp(lf->filename, koname)) + break; + if (!strcmp(lf->filename, filename)) + break; + } + lockmgr(&lock, LK_RELEASE, 0, curproc); + +out: + if (koname) + free(koname, M_LINKER); + return lf; +} + +linker_file_t +linker_find_file_by_id(int fileid) +{ + linker_file_t lf = 0; + + lockmgr(&lock, LK_SHARED, 0, curproc); + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) + if (lf->id == fileid) + break; + lockmgr(&lock, LK_RELEASE, 0, curproc); + + return lf; +} + +linker_file_t +linker_make_file(const char* pathname, void* priv, struct linker_file_ops* ops) +{ + linker_file_t lf = 0; + int namelen; + const char *filename; + + filename = rindex(pathname, '/'); + if (filename && filename[1]) + filename++; + else + filename = pathname; + + KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename)); + lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc); + namelen = strlen(filename) + 1; + lf = malloc(sizeof(struct linker_file) + namelen, M_LINKER, M_WAITOK); + if (!lf) + goto out; + bzero(lf, sizeof(*lf)); + + lf->refs = 1; + lf->userrefs = 0; + lf->filename = (char*) (lf + 1); + strcpy(lf->filename, filename); + lf->id = next_file_id++; + lf->ndeps = 0; + lf->deps = NULL; + STAILQ_INIT(&lf->common); + TAILQ_INIT(&lf->modules); + + lf->priv = priv; + lf->ops = ops; + TAILQ_INSERT_TAIL(&files, lf, link); + +out: + lockmgr(&lock, LK_RELEASE, 0, curproc); + return lf; +} + +int +linker_file_unload(linker_file_t file) +{ + module_t mod, next; + struct common_symbol* cp; + int error = 0; + int i; + + KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); + lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc); + if (file->refs == 1) { + KLD_DPF(FILE, ("linker_file_unload: file is unloading, informing modules\n")); + /* + * Inform any modules associated with this file. + */ + for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { + next = module_getfnext(mod); + + /* + * Give the module a chance to veto the unload. 
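The unload path gives every module attached to the file a chance to refuse: the first nonzero return from the per-module unload hook aborts the whole operation and the file stays loaded. A reduced sketch of that veto loop, with a hypothetical module type and without the reference counting and locking of the real routine:

    #include <stdio.h>

    struct module {
        const char    *name;
        int          (*unload)(struct module *);   /* nonzero return vetoes */
        struct module *next;
    };

    /*
     * Ask each module on a file's list to unload; the first nonzero return
     * aborts the whole operation, loosely mirroring linker_file_unload().
     */
    static int
    unload_all(struct module *head)
    {
        struct module *m;
        int error;

        for (m = head; m != NULL; m = m->next) {
            error = m->unload(m);
            if (error != 0) {
                printf("%s vetoed the unload\n", m->name);
                return (error);     /* leave everything loaded */
            }
        }
        return (0);
    }

    static int ok(struct module *m)   { (void)m; return (0); }
    static int busy(struct module *m) { (void)m; return (16); }  /* EBUSY-style veto */

    int
    main(void)
    {
        struct module b = { "busy.ko", busy, NULL };
        struct module a = { "quiet.ko", ok, &b };

        return (unload_all(&a));    /* prints: busy.ko vetoed the unload */
    }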
+ */ + if (error = module_unload(mod)) { + KLD_DPF(FILE, ("linker_file_unload: module %x vetoes unload\n", + mod)); + lockmgr(&lock, LK_RELEASE, 0, curproc); + goto out; + } + + module_release(mod); + } + } + + file->refs--; + if (file->refs > 0) { + lockmgr(&lock, LK_RELEASE, 0, curproc); + goto out; + } + + linker_file_sysuninit(file); + + TAILQ_REMOVE(&files, file, link); + lockmgr(&lock, LK_RELEASE, 0, curproc); + + for (i = 0; i < file->ndeps; i++) + linker_file_unload(file->deps[i]); + free(file->deps, M_LINKER); + + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_FIRST(&file->common)) { + STAILQ_REMOVE(&file->common, cp, common_symbol, link); + free(cp, M_LINKER); + } + + file->ops->unload(file); + free(file, M_LINKER); + +out: + return error; +} + +int +linker_file_add_dependancy(linker_file_t file, linker_file_t dep) +{ + linker_file_t* newdeps; + + newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t*), + M_LINKER, M_WAITOK); + if (newdeps == NULL) + return ENOMEM; + bzero(newdeps, (file->ndeps + 1) * sizeof(linker_file_t*)); + + if (file->deps) { + bcopy(file->deps, newdeps, file->ndeps * sizeof(linker_file_t*)); + free(file->deps, M_LINKER); + } + file->deps = newdeps; + file->deps[file->ndeps] = dep; + file->ndeps++; + + return 0; +} + +caddr_t +linker_file_lookup_symbol(linker_file_t file, const char* name, int deps) +{ + linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + caddr_t address; + size_t common_size = 0; + int i; + + KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n", + file, name, deps)); + + if (file->ops->lookup_symbol(file, name, &sym) == 0) { + file->ops->symbol_values(file, sym, &symval); + if (symval.value == 0) + /* + * For commons, first look them up in the dependancies and + * only allocate space if not found there. + */ + common_size = symval.size; + else { + KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol.value=%x\n", symval.value)); + return symval.value; + } + } + + if (deps) { + for (i = 0; i < file->ndeps; i++) { + address = linker_file_lookup_symbol(file->deps[i], name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: deps value=%x\n", address)); + return address; + } + } + + /* If we have not found it in the dependencies, search globally */ + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + /* But skip the current file if it's on the list */ + if (lf == file) + continue; + /* And skip the files we searched above */ + for (i = 0; i < file->ndeps; i++) + if (lf == file->deps[i]) + break; + if (i < file->ndeps) + continue; + address = linker_file_lookup_symbol(lf, name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: global value=%x\n", address)); + return address; + } + } + } + + if (common_size > 0) { + /* + * This is a common symbol which was not found in the + * dependancies. We maintain a simple common symbol table in + * the file object. + */ + struct common_symbol* cp; + + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_NEXT(cp, link)) + if (!strcmp(cp->name, name)) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: old common value=%x\n", cp->address)); + return cp->address; + } + + /* + * Round the symbol size up to align. 
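+	 * (Illustrative numbers, not from the original source: with
+	 * sizeof(int) == 4, a 10-byte common rounds up to 12, because
+	 * -sizeof(int) is the unsigned equivalent of ~(sizeof(int) - 1).)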
+ */ + common_size = (common_size + sizeof(int) - 1) & -sizeof(int); + cp = malloc(sizeof(struct common_symbol) + + common_size + + strlen(name) + 1, + M_LINKER, M_WAITOK); + if (!cp) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n")); + return 0; + } + bzero(cp, sizeof(struct common_symbol) + common_size + strlen(name)+ 1); + + cp->address = (caddr_t) (cp + 1); + cp->name = cp->address + common_size; + strcpy(cp->name, name); + bzero(cp->address, common_size); + STAILQ_INSERT_TAIL(&file->common, cp, link); + + KLD_DPF(SYM, ("linker_file_lookup_symbol: new common value=%x\n", cp->address)); + return cp->address; + } + + KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); + return 0; +} + +#ifdef DDB +/* + * DDB Helpers. DDB has to look across multiple files with their own + * symbol tables and string tables. + * + * Note that we do not obey list locking protocols here. We really don't + * need DDB to hang because somebody's got the lock held. We'll take the + * chance that the files list is inconsistant instead. + */ + +int +linker_ddb_lookup(char *symstr, linker_sym_t *sym) +{ + linker_file_t lf; + + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->lookup_symbol(lf, symstr, sym) == 0) + return 0; + } + return ENOENT; +} + +int +linker_ddb_search_symbol(caddr_t value, linker_sym_t *sym, long *diffp) +{ + linker_file_t lf; + u_long off = (u_long)value; + u_long diff, bestdiff; + linker_sym_t best; + linker_sym_t es; + + best = 0; + bestdiff = off; + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->search_symbol(lf, value, &es, &diff) != 0) + continue; + if (es != 0 && diff < bestdiff) { + best = es; + bestdiff = diff; + } + if (bestdiff == 0) + break; + } + if (best) { + *sym = best; + *diffp = bestdiff; + return 0; + } else { + *sym = 0; + *diffp = off; + return ENOENT; + } +} + +int +linker_ddb_symbol_values(linker_sym_t sym, linker_symval_t *symval) +{ + linker_file_t lf; + + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->symbol_values(lf, sym, symval) == 0) + return 0; + } + return ENOENT; +} + +#endif + +/* + * Syscalls. 
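+ *
+ * kldload() and kldunload() load and unload files; they are restricted to
+ * the superuser and refused once securelevel is raised.  kldfind(),
+ * kldnext(), kldstat(), kldfirstmod() and kldsym() are unprivileged
+ * queries over the file list.
+ *
+ * Editor's sketch, not part of the original file and assuming the usual
+ * userland libc wrappers for these entry points:
+ *
+ *	struct kld_file_stat ks;
+ *	int fileid = kldload("mymod");	/* hypothetical module name */
+ *	ks.version = sizeof(ks);	/* checked by kldstat() below */
+ *	if (fileid != -1 && kldstat(fileid, &ks) == 0)
+ *		printf("%s loaded as id %d\n", ks.name, ks.id);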
+ */ + +int +kldload(struct proc* p, struct kldload_args* uap) +{ + char* filename = NULL, *modulename; + linker_file_t lf; + int error = 0; + + p->p_retval[0] = -1; + + if (securelevel > 0) + return EPERM; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return error; + + filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL)) + goto out; + + /* Can't load more than one module with the same name */ + modulename = rindex(filename, '/'); + if (modulename == NULL) + modulename = filename; + if (linker_find_file_by_name(modulename)) { + error = EEXIST; + goto out; + } + + if (error = linker_load_file(filename, &lf)) + goto out; + + lf->userrefs++; + p->p_retval[0] = lf->id; + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +int +kldunload(struct proc* p, struct kldunload_args* uap) +{ + linker_file_t lf; + int error = 0; + + if (securelevel > 0) + return EPERM; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return error; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); + if (lf->userrefs == 0) { + printf("linkerunload: attempt to unload file which was not loaded by user\n"); + error = EBUSY; + goto out; + } + error = linker_file_unload(lf); + if (error) + goto out; + lf->userrefs--; + } else + error = ENOENT; + +out: + return error; +} + +int +kldfind(struct proc* p, struct kldfind_args* uap) +{ + char* filename = NULL, *modulename; + linker_file_t lf; + int error = 0; + + p->p_retval[0] = -1; + + filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL)) + goto out; + + modulename = rindex(filename, '/'); + if (modulename == NULL) + modulename = filename; + + lf = linker_find_file_by_name(modulename); + if (lf) + p->p_retval[0] = lf->id; + else + error = ENOENT; + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +int +kldnext(struct proc* p, struct kldnext_args* uap) +{ + linker_file_t lf; + int error = 0; + + if (SCARG(uap, fileid) == 0) { + if (TAILQ_FIRST(&files)) + p->p_retval[0] = TAILQ_FIRST(&files)->id; + else + p->p_retval[0] = 0; + return 0; + } + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_NEXT(lf, link)) + p->p_retval[0] = TAILQ_NEXT(lf, link)->id; + else + p->p_retval[0] = 0; + } else + error = ENOENT; + + return error; +} + +int +kldstat(struct proc* p, struct kldstat_args* uap) +{ + linker_file_t lf; + int error = 0; + int version; + struct kld_file_stat* stat; + int namelen; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (!lf) { + error = ENOENT; + goto out; + } + + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
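+     * The version field is simply sizeof(struct kld_file_stat); anything
+     * else means the caller was built against a different layout and gets
+     * EINVAL instead of a partially filled structure.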
+ */ + if (error = copyin(&stat->version, &version, sizeof(version))) + goto out; + if (version != sizeof(struct kld_file_stat)) { + error = EINVAL; + goto out; + } + + namelen = strlen(lf->filename) + 1; + if (namelen > MAXPATHLEN) + namelen = MAXPATHLEN; + if (error = copyout(lf->filename, &stat->name[0], namelen)) + goto out; + if (error = copyout(&lf->refs, &stat->refs, sizeof(int))) + goto out; + if (error = copyout(&lf->id, &stat->id, sizeof(int))) + goto out; + if (error = copyout(&lf->address, &stat->address, sizeof(caddr_t))) + goto out; + if (error = copyout(&lf->size, &stat->size, sizeof(size_t))) + goto out; + + p->p_retval[0] = 0; + +out: + return error; +} + +int +kldfirstmod(struct proc* p, struct kldfirstmod_args* uap) +{ + linker_file_t lf; + int error = 0; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_FIRST(&lf->modules)) + p->p_retval[0] = module_getid(TAILQ_FIRST(&lf->modules)); + else + p->p_retval[0] = 0; + } else + error = ENOENT; + + return error; +} + +int +kldsym(struct proc *p, struct kldsym_args *uap) +{ + char *symstr = NULL; + linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + struct kld_sym_lookup lookup; + int error = 0; + + if (error = copyin(SCARG(uap, data), &lookup, sizeof(lookup))) + goto out; + if (lookup.version != sizeof(lookup) || SCARG(uap, cmd) != KLDSYM_LOOKUP) { + error = EINVAL; + goto out; + } + + symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) + goto out; + + if (SCARG(uap, fileid) != 0) { + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 && + lf->ops->symbol_values(lf, sym, &symval) == 0) { + lookup.symvalue = (u_long)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), sizeof(lookup)); + } else + error = ENOENT; + } else { + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 && + lf->ops->symbol_values(lf, sym, &symval) == 0) { + lookup.symvalue = (u_long)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), sizeof(lookup)); + break; + } + } + if (!lf) + error = ENOENT; + } +out: + if (symstr) + free(symstr, M_TEMP); + return error; +} + +/* + * Preloaded module support + */ + +static void +linker_preload(void* arg) +{ + caddr_t modptr; + char *modname; + char *modtype; + linker_file_t lf; + linker_class_t lc; + int error; + struct linker_set *sysinits; + struct sysinit **sipp; + moduledata_t *moddata; + + modptr = NULL; + while ((modptr = preload_search_next_name(modptr)) != NULL) { + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); + if (modname == NULL) { + printf("Preloaded module at %p does not have a name!\n", modptr); + continue; + } + if (modtype == NULL) { + printf("Preloaded module at %p does not have a type!\n", modptr); + continue; + } + printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); + lf = linker_find_file_by_name(modname); + if (lf) { + lf->userrefs++; + continue; + } + lf = NULL; + for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) { + error = lc->ops->load_file(modname, &lf); + if (error) { + lf = NULL; + break; + } + } + if (lf) { + lf->userrefs++; + + sysinits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysinit_set", 0); + if (sysinits) { + /* HACK 
ALERT! + * This is to set the sysinit moduledata so that the module + * can attach itself to the correct containing file. + * The sysinit could be run at *any* time. + */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->func == module_register_init) { + moddata = (*sipp)->udata; + moddata->_file = lf; + } + } + sysinit_add((struct sysinit **)sysinits->ls_items); + } + } + } +} + +SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); + +/* + * Search for a not-loaded module by name. + * + * Modules may be found in the following locations: + * + * - preloaded (result is just the module name) + * - on disk (result is full path to module) + * + * If the module name is qualified in any way (contains path, etc.) + * the we simply return a copy of it. + * + * The search path can be manipulated via sysctl. Note that we use the ';' + * character as a separator to be consistent with the bootloader. + */ + +static char linker_path[MAXPATHLEN + 1] = "/;/boot/;/modules/"; + +SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path, + sizeof(linker_path), "module load search path"); + +static char * +linker_strdup(const char *str) +{ + char *result; + + if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL) + strcpy(result, str); + return(result); +} + +char * +linker_search_path(const char *name) +{ + struct nameidata nd; + struct proc *p = curproc; /* XXX */ + char *cp, *ep, *result; + int error; + enum vtype type; + + /* qualified at all? */ + if (index(name, '/')) + return(linker_strdup(name)); + + /* traverse the linker path */ + cp = linker_path; + for (;;) { + + /* find the end of this component */ + for (ep = cp; (*ep != 0) && (*ep != ';'); ep++) + ; + result = malloc((strlen(name) + (ep - cp) + 1), M_LINKER, M_WAITOK); + if (result == NULL) /* actually ENOMEM */ + return(NULL); + + strncpy(result, cp, ep - cp); + strcpy(result + (ep - cp), name); + + /* + * Attempt to open the file, and return the path if we succeed and it's + * a regular file. + */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, p); + error = vn_open(&nd, FREAD, 0); + if (error == 0) { + type = nd.ni_vp->v_type; + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + if (type == VREG) + return(result); + } + free(result, M_LINKER); + + if (*ep == 0) + break; + cp = ep + 1; + } + return(NULL); +} diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c new file mode 100644 index 0000000..e5ea629 --- /dev/null +++ b/sys/kern/kern_lkm.c @@ -0,0 +1,838 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1994 Christopher G. Demetriou + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_lkm.c,v 1.59 1998/11/10 09:12:40 peter Exp $ + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/exec.h> +#include <sys/lkm.h> +#include <sys/vnode.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + + +#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */ + +#define LKM_ALLOC 0x01 +#define LKM_WANT 0x02 + +#define LKMS_IDLE 0x00 +#define LKMS_RESERVED 0x01 +#define LKMS_LOADING 0x02 +#define LKMS_LOADED 0x04 +#define LKMS_UNLOADING 0x08 + +static int lkm_v = 0; +static int lkm_state = LKMS_IDLE; + +#ifndef MAXLKMS +#define MAXLKMS 20 +#endif + +static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */ +static struct lkm_table *curp; /* global for in-progress ops */ + +static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd)); +static void lkmunreserve __P((void)); + +static d_open_t lkmcopen; +static d_close_t lkmcclose; +static d_ioctl_t lkmcioctl; + +#define CDEV_MAJOR 32 +static struct cdevsw lkmc_cdevsw = + { lkmcopen, lkmcclose, noread, nowrite, /*32*/ + lkmcioctl, nostop, nullreset, nodevtotty, + seltrue, nommap, NULL, "lkm", NULL, -1 }; + + +/*ARGSUSED*/ +static int +lkmcopen(dev, flag, devtype, p) + dev_t dev; + int flag; + int devtype; + struct proc *p; +{ + int error; + + if (minor(dev) != 0) + return(ENXIO); /* bad minor # */ + + /* + * Use of the loadable kernel module device must be exclusive; we + * may try to remove this restriction later, but it's really no + * hardship. + */ + while (lkm_v & LKM_ALLOC) { + if (flag & FNONBLOCK) /* don't hang */ + return(EBUSY); + lkm_v |= LKM_WANT; + /* + * Sleep pending unlock; we use tsleep() to allow + * an alarm out of the open. + */ + error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0); + if (error) + return(error); /* leave LKM_WANT set -- no problem */ + } + lkm_v |= LKM_ALLOC; + + return(0); /* pseudo-device open */ +} + +/* + * Unreserve the memory associated with the current loaded module; done on + * a coerced close of the lkm device (close on premature exit of modload) + * or explicitly by modload as a result of a link failure. 
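+ * Callers set lkm_state to LKMS_UNLOADING beforehand where necessary,
+ * since this routine returns immediately if the state is already
+ * LKMS_IDLE.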
+ */ +static void +lkmunreserve() +{ + + if (lkm_state == LKMS_IDLE) + return; + + /* + * Actually unreserve the memory + */ + if (curp && curp->area) { + kmem_free(kernel_map, curp->area, curp->size);/**/ + curp->area = 0; + if (curp->private.lkm_any != NULL) + curp->private.lkm_any = NULL; + } + + lkm_state = LKMS_IDLE; +} + +static int +lkmcclose(dev, flag, mode, p) + dev_t dev; + int flag; + int mode; + struct proc *p; +{ + + if (!(lkm_v & LKM_ALLOC)) { +#ifdef DEBUG + printf("LKM: close before open!\n"); +#endif /* DEBUG */ + return(EBADF); + } + + /* do this before waking the herd... */ + if (curp && !curp->used) { + /* + * If we close before setting used, we have aborted + * by way of error or by way of close-on-exit from + * a premature exit of "modload". + */ + lkmunreserve(); /* coerce state to LKM_IDLE */ + } + + lkm_v &= ~LKM_ALLOC; + wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */ + + return(0); /* pseudo-device closed */ +} + +/*ARGSUSED*/ +static int +lkmcioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int err = 0; + int i; + struct lmc_resrv *resrvp; + struct lmc_loadbuf *loadbufp; + struct lmc_unload *unloadp; + struct lmc_stat *statp; + char istr[MAXLKMNAME]; + + switch(cmd) { + case LMRESERV: /* reserve pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + resrvp = (struct lmc_resrv *)data; + + /* + * Find a free slot. + */ + for (i = 0; i < MAXLKMS; i++) + if (!lkmods[i].used) + break; + if (i == MAXLKMS) { + err = ENOMEM; /* no slots available */ + break; + } + curp = &lkmods[i]; + curp->id = i; /* self reference slot offset */ + + resrvp->slot = i; /* return slot */ + + /* + * Get memory for module + */ + curp->size = resrvp->size; + + curp->area = kmem_alloc(kernel_map, curp->size);/**/ + + curp->offset = 0; /* load offset */ + + resrvp->addr = curp->area; /* ret kernel addr */ + +#ifdef DEBUG + printf("LKM: LMRESERV (actual = 0x%08lx)\n", curp->area); + printf("LKM: LMRESERV (adjusted = 0x%08lx)\n", + trunc_page(curp->area)); +#endif /* DEBUG */ + lkm_state = LKMS_RESERVED; + break; + + case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + loadbufp = (struct lmc_loadbuf *)data; + i = loadbufp->cnt; + if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING) + || i < 0 + || i > MODIOBUF + || i > curp->size - curp->offset) { + err = ENOMEM; + break; + } + + /* copy in buffer full of data */ + err = copyin((caddr_t)loadbufp->data, + (caddr_t)(uintptr_t)(curp->area + curp->offset), i); + if (err) + break; + + if ((curp->offset + i) < curp->size) { + lkm_state = LKMS_LOADING; +#ifdef DEBUG + printf( + "LKM: LMLOADBUF (loading @ %lu of %lu, i = %d)\n", + curp->offset, curp->size, i); +#endif /* DEBUG */ + } else { + lkm_state = LKMS_LOADED; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loaded)\n"); +#endif /* DEBUG */ + } + curp->offset += i; + break; + + case LMUNRESRV: /* discard reserved pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + lkmunreserve(); /* coerce state to LKM_IDLE */ +#ifdef DEBUG + printf("LKM: LMUNRESERV\n"); +#endif /* DEBUG */ + break; + + case LMREADY: /* module loaded: call entry */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing or insecure */ + return EPERM; + 
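+		/*
+		 * LMLOADBUF has already copied the module image into the
+		 * reserved area; LKMS_LOADING at this point just means the
+		 * unwritten tail is bss, which the switch below clears.
+		 */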
+ switch (lkm_state) { + case LKMS_LOADED: + break; + case LKMS_LOADING: + /* The remainder must be bss, so we clear it */ + bzero((caddr_t)(uintptr_t)(curp->area + curp->offset), + curp->size - curp->offset); + break; + default: + +#ifdef DEBUG + printf("lkm_state is %02x\n", lkm_state); +#endif /* DEBUG */ + return ENXIO; + } + + /* XXX gack */ + curp->entry = (int (*) __P((struct lkm_table *, int, int))) + (*(uintfptr_t *)data); + + /* call entry(load)... (assigns "private" portion) */ + err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION); + if (err) { + /* + * Module may refuse loading or may have a + * version mismatch... + */ + lkm_state = LKMS_UNLOADING; /* for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + } + /* + * It's possible for a user to load a module that doesn't + * initialize itself correctly. (You can even get away with + * using it for a while.) Unfortunately, we are faced with + * the following problems: + * - we can't tell a good module from a bad one until + * after we've run its entry function (if the private + * section is uninitalized after we return from the + * entry, then something's fishy) + * - now that we've called the entry function, we can't + * forcibly unload the module without risking a crash + * - since we don't know what the module's entry function + * did, we can't easily clean up the mess it may have + * made, so we can't know just how unstable the system + * may be + * So, being stuck between a rock and a hard place, we + * have no choice but to do this... + */ + if (curp->private.lkm_any == NULL) + panic("loadable module initialization failed"); + + curp->used = 1; +#ifdef DEBUG + printf("LKM: LMREADY\n"); +#endif /* DEBUG */ + lkm_state = LKMS_IDLE; + break; + + case LMUNLOAD: /* unload a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + unloadp = (struct lmc_unload *)data; + + if ((i = unloadp->id) == -1) { /* unload by name */ + /* + * Copy name and lookup id from all loaded + * modules. May fail. + */ + err =copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL); + if (err) + break; + + /* + * look up id... + */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { + err = ENOENT; + break; + } + + /* call entry(unload) */ + if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) { + err = EBUSY; + break; + } + + lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + + case LMSTAT: /* stat a module by id/name */ + /* allow readers and writers to stat */ + + statp = (struct lmc_stat *)data; + + if ((i = statp->id) == -1) { /* stat by name */ + /* + * Copy name and lookup id from all loaded + * modules. + */ + copystr(statp->name, istr, MAXLKMNAME-1, NULL); + /* + * look up id... 
+ */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + + if (i == MAXLKMS) { /* Not found */ + err = ENOENT; + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { /* Not found */ + err = ENOENT; + break; + } + + /* + * Copy out stat information for this module... + */ + statp->id = curp->id; + statp->offset = curp->private.lkm_any->lkm_offset; + statp->type = curp->private.lkm_any->lkm_type; + statp->area = curp->area; + statp->size = curp->size / PAGESIZE; + statp->private = (uintptr_t)curp->private.lkm_any; + statp->ver = curp->private.lkm_any->lkm_ver; + copystr(curp->private.lkm_any->lkm_name, + statp->name, + MAXLKMNAME - 2, + NULL); + + break; + + default: /* bad ioctl()... */ + err = ENOTTY; + break; + } + + return (err); +} + +int +lkmexists(lkmtp) + struct lkm_table *lkmtp; +{ + int i; + + /* + * see if name exists... + */ + for (i = 0; i < MAXLKMS; i++) { + /* + * An unused module and the one we are testing are not + * considered. + */ + if (!lkmods[i].used || &lkmods[i] == lkmtp) + continue; + if (!strcmp(lkmtp->private.lkm_any->lkm_name, + lkmods[i].private.lkm_any->lkm_name)) + return(1); /* already loaded... */ + } + + return(0); /* module not loaded... */ +} + +/* + * For the loadable system call described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_syscall(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_syscall *args = lkmtp->private.lkm_syscall; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + if (args->lkm_offset == LKM_ANON) + i = NO_SYSCALL; + else + i = args->lkm_offset; + + err = syscall_register(&i, args->lkm_sysent, + &(args->lkm_oldent)); + if (err) + return(err); + + /* done! */ + args->lkm_offset = i; /* slot in sysent[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + err = syscall_deregister(&i, &(args->lkm_oldent)); + if (err) + return(err); + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +/* + * For the loadable virtual file system described by the structure pointed + * to by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_vfs(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_vfs *args = lkmtp->private.lkm_vfs; + struct vfsconf *vfc = args->lkm_vfsconf; + int error, i; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) + vfs_add_vnodeops((void*)args->lkm_vnodeops->ls_items[i]); + error = vfs_register(vfc); + if (error) + return(error); + + args->lkm_offset = vfc->vfc_typenum; + + /* done! */ + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + error = vfs_unregister(vfc); + if (error) + return(error); + + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) + vfs_rm_vnodeops((void*)args->lkm_vnodeops->ls_items[i]); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return (0); +} + +/* + * For the loadable device driver described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. 
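+ *
+ * Only character devices (LM_DT_CHAR) are handled: loading installs the
+ * module's cdevsw in the requested major slot (LKM_ANON lets cdevsw_add()
+ * choose one) and records the result in lkm_offset; unloading restores
+ * the saved entry.  Any other device type gets ENODEV.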
+ */ +static int +_lkm_dev(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_dev *args = lkmtp->private.lkm_dev; + int i; + dev_t descrip; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + switch(args->lkm_devtype) { + case LM_DT_CHAR: + if ((i = args->lkm_offset) == LKM_ANON) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = cdevsw_add(&descrip, args->lkm_dev.cdev, + &(args->lkm_olddev.cdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + descrip = makedev(i,0); + + switch(args->lkm_devtype) { + case LM_DT_CHAR: + /* replace current slot contents with old contents */ + cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL); + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +#ifdef STREAMS +/* + * For the loadable streams module described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_strmod(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + break; + + case LKM_E_UNLOAD: + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} +#endif /* STREAMS */ + +/* + * For the loadable execution class described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_exec(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_exec *args = lkmtp->private.lkm_exec; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if (args->lkm_offset != LKM_ANON) { /* auto */ + err = EINVAL; + break; + } + + err = exec_register(args->lkm_exec); + + /* done! */ + args->lkm_offset = 0; + + break; + + case LKM_E_UNLOAD: + + err = exec_unregister(args->lkm_exec); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* + * This code handles the per-module type "wiring-in" of loadable modules + * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring + * is assumed to be done in their entry routines internal to the module + * itself. 
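+ *
+ * lkmdispatch() simply switches on lkm_type: LM_SYSCALL, LM_VFS, LM_DEV
+ * and LM_EXEC are passed to the matching _lkm_*() helper, LM_MISC only
+ * gets an "already loaded" check, and unknown types return ENXIO.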
+ */ +int +lkmdispatch(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + int err = 0; /* default = success */ + + switch(lkmtp->private.lkm_any->lkm_type) { + case LM_SYSCALL: + err = _lkm_syscall(lkmtp, cmd); + break; + + case LM_VFS: + err = _lkm_vfs(lkmtp, cmd); + break; + + case LM_DEV: + err = _lkm_dev(lkmtp, cmd); + break; + +#ifdef STREAMS + case LM_STRMOD: + { + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + } + break; + +#endif /* STREAMS */ + + case LM_EXEC: + err = _lkm_exec(lkmtp, cmd); + break; + + case LM_MISC: /* ignore content -- no "misc-specific" procedure */ + if (lkmexists(lkmtp)) + err = EEXIST; + break; + + default: + err = ENXIO; /* unknown type */ + break; + } + + return(err); +} + +int +lkm_nullcmd(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + + return (0); +} + +#ifdef DEVFS +static void *lkmc_devfs_token; +#endif + +static int +lkm_modevent(module_t mod, int type, void *data) +{ + dev_t dev; + static struct cdevsw *oldcdevsw; + + switch (type) { + case MOD_LOAD: + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev, &lkmc_cdevsw, &oldcdevsw); +#ifdef DEVFS + lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0644, + "lkm"); +#endif + break; + case MOD_UNLOAD: +#ifdef DEVFS + devfs_remove_dev(lkmc_devfs_token); +#endif + cdevsw_add(&dev, oldcdevsw, NULL); + break; + default: + break; + } + return 0; +} +static moduledata_t lkm_mod = { + "lkm", + lkm_modevent, + NULL +}; +DECLARE_MODULE(lkm, lkm_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR); diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c new file mode 100644 index 0000000..e832acf --- /dev/null +++ b/sys/kern/kern_lock.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 1995 + * The Regents of the University of California. All rights reserved. + * + * Copyright (C) 1997 + * John S. Dyson. All rights reserved. + * + * This code contains ideas from software contributed to Berkeley by + * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating + * System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95 + * $Id: kern_lock.c,v 1.22 1999/01/10 01:58:24 eivind Exp $ + */ + +#include "opt_lint.h" + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/systm.h> + +/* + * Locking primitives implementation. + * Locks provide shared/exclusive sychronization. + */ + +#ifdef SIMPLELOCK_DEBUG +#define COUNT(p, x) if (p) (p)->p_locks += (x) +#else +#define COUNT(p, x) +#endif + +#define LOCK_WAIT_TIME 100 +#define LOCK_SAMPLE_WAIT 7 + +#if defined(DIAGNOSTIC) +#define LOCK_INLINE +#else +#define LOCK_INLINE __inline +#endif + +#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \ + LK_SHARE_NONZERO | LK_WAIT_NONZERO) + +static int acquire(struct lock *lkp, int extflags, int wanted); +static int apause(struct lock *lkp, int flags); +static int acquiredrain(struct lock *lkp, int extflags) ; + +static LOCK_INLINE void +sharelock(struct lock *lkp, int incr) { + lkp->lk_flags |= LK_SHARE_NONZERO; + lkp->lk_sharecount += incr; +} + +static LOCK_INLINE void +shareunlock(struct lock *lkp, int decr) { + + KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr")); + + if (lkp->lk_sharecount == decr) { + lkp->lk_flags &= ~LK_SHARE_NONZERO; + if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) { + wakeup(lkp); + } + lkp->lk_sharecount = 0; + } else { + lkp->lk_sharecount -= decr; + } +} + +/* + * This is the waitloop optimization, and note for this to work + * simple_lock and simple_unlock should be subroutines to avoid + * optimization troubles. + */ +static int +apause(struct lock *lkp, int flags) { + int lock_wait; + lock_wait = LOCK_WAIT_TIME; + for (; lock_wait > 0; lock_wait--) { + int i; + if ((lkp->lk_flags & flags) == 0) + return 0; + simple_unlock(&lkp->lk_interlock); + for (i = LOCK_SAMPLE_WAIT; i > 0; i--) { + if ((lkp->lk_flags & flags) == 0) { + simple_lock(&lkp->lk_interlock); + if ((lkp->lk_flags & flags) == 0) + return 0; + break; + } + } + } + return 1; +} + +static int +acquire(struct lock *lkp, int extflags, int wanted) { + int s, error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) { + return EBUSY; + } + + if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) { + error = apause(lkp, wanted); + if (error == 0) + return 0; + } + + s = splhigh(); + while ((lkp->lk_flags & wanted) != 0) { + lkp->lk_flags |= LK_WAIT_NONZERO; + lkp->lk_waitcount++; + simple_unlock(&lkp->lk_interlock); + error = tsleep(lkp, lkp->lk_prio, lkp->lk_wmesg, lkp->lk_timo); + simple_lock(&lkp->lk_interlock); + if (lkp->lk_waitcount == 1) { + lkp->lk_flags &= ~LK_WAIT_NONZERO; + lkp->lk_waitcount = 0; + } else { + lkp->lk_waitcount--; + } + if (error) { + splx(s); + return error; + } + if (extflags & LK_SLEEPFAIL) { + splx(s); + return ENOLCK; + } + } + splx(s); + return 0; +} + +/* + * Set, change, or release a lock. + * + * Shared requests increment the shared count. 
Exclusive requests set the + * LK_WANT_EXCL flag (preventing further shared locks), and wait for already + * accepted shared locks and shared-to-exclusive upgrades to go away. + */ +int +#ifndef DEBUG_LOCKS +lockmgr(lkp, flags, interlkp, p) +#else +debuglockmgr(lkp, flags, interlkp, p, name, file, line) +#endif + struct lock *lkp; + u_int flags; + struct simplelock *interlkp; + struct proc *p; +#ifdef DEBUG_LOCKS + const char *name; /* Name of lock function */ + const char *file; /* Name of file call is from */ + int line; /* Line number in file */ +#endif +{ + int error; + pid_t pid; + int extflags; + + error = 0; + if (p == NULL) + pid = LK_KERNPROC; + else + pid = p->p_pid; + + simple_lock(&lkp->lk_interlock); + if (flags & LK_INTERLOCK) + simple_unlock(interlkp); + + extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK; + + switch (flags & LK_TYPE_MASK) { + + case LK_SHARED: + if (lkp->lk_lockholder != pid) { + error = acquire(lkp, extflags, + LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE); + if (error) + break; + sharelock(lkp, 1); + COUNT(p, 1); + break; + } + /* + * We hold an exclusive lock, so downgrade it to shared. + * An alternative would be to fail with EDEADLK. + */ + sharelock(lkp, 1); + COUNT(p, 1); + /* fall into downgrade */ + + case LK_DOWNGRADE: +#if !defined(MAX_PERF) + if (lkp->lk_lockholder != pid || lkp->lk_exclusivecount == 0) + panic("lockmgr: not holding exclusive lock"); +#endif + sharelock(lkp, lkp->lk_exclusivecount); + lkp->lk_exclusivecount = 0; + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + if (lkp->lk_waitcount) + wakeup((void *)lkp); + break; + + case LK_EXCLUPGRADE: + /* + * If another process is ahead of us to get an upgrade, + * then we want to fail rather than have an intervening + * exclusive access. + */ + if (lkp->lk_flags & LK_WANT_UPGRADE) { + shareunlock(lkp, 1); + COUNT(p, -1); + error = EBUSY; + break; + } + /* fall into normal upgrade */ + + case LK_UPGRADE: + /* + * Upgrade a shared lock to an exclusive one. If another + * shared lock has already requested an upgrade to an + * exclusive lock, our shared lock is released and an + * exclusive lock is requested (which will be granted + * after the upgrade). If we return an error, the file + * will always be unlocked. + */ +#if !defined(MAX_PERF) + if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0)) + panic("lockmgr: upgrade exclusive lock"); +#endif + shareunlock(lkp, 1); + COUNT(p, -1); + /* + * If we are just polling, check to see if we will block. + */ + if ((extflags & LK_NOWAIT) && + ((lkp->lk_flags & LK_WANT_UPGRADE) || + lkp->lk_sharecount > 1)) { + error = EBUSY; + break; + } + if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) { + /* + * We are first shared lock to request an upgrade, so + * request upgrade and wait for the shared count to + * drop to zero, then take exclusive lock. + */ + lkp->lk_flags |= LK_WANT_UPGRADE; + error = acquire(lkp, extflags, LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_UPGRADE; + + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; +#if !defined(MAX_PERF) + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); +#endif + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + } + /* + * Someone else has requested upgrade. Release our shared + * lock, awaken upgrade requestor if we are the last shared + * lock, then request an exclusive lock. 
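+		 * The wakeup below only fires when ours was the last shared
+		 * hold and somebody is actually asleep on the lock, i.e.
+		 * LK_WAIT_NONZERO is set while LK_SHARE_NONZERO is not.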
+ */ + if ( (lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) == + LK_WAIT_NONZERO) + wakeup((void *)lkp); + /* fall into exclusive request */ + + case LK_EXCLUSIVE: + if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) { + /* + * Recursive lock. + */ +#if !defined(MAX_PERF) + if ((extflags & LK_CANRECURSE) == 0) + panic("lockmgr: locking against myself"); +#endif + lkp->lk_exclusivecount++; + COUNT(p, 1); + break; + } + /* + * If we are just polling, check to see if we will sleep. + */ + if ((extflags & LK_NOWAIT) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) { + error = EBUSY; + break; + } + /* + * Try to acquire the want_exclusive flag. + */ + error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL)); + if (error) + break; + lkp->lk_flags |= LK_WANT_EXCL; + /* + * Wait for shared locks and upgrades to finish. + */ + error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_EXCL; + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; +#if !defined(MAX_PERF) + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); +#endif + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + + case LK_RELEASE: + if (lkp->lk_exclusivecount != 0) { +#if !defined(MAX_PERF) + if (pid != lkp->lk_lockholder) + panic("lockmgr: pid %d, not %s %d unlocking", + pid, "exclusive lock holder", + lkp->lk_lockholder); +#endif + COUNT(p, -1); + if (lkp->lk_exclusivecount == 1) { + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + lkp->lk_exclusivecount = 0; + } else { + lkp->lk_exclusivecount--; + } + } else if (lkp->lk_flags & LK_SHARE_NONZERO) { + shareunlock(lkp, 1); + COUNT(p, -1); + } + if (lkp->lk_flags & LK_WAIT_NONZERO) + wakeup((void *)lkp); + break; + + case LK_DRAIN: + /* + * Check that we do not already hold the lock, as it can + * never drain if we do. Unfortunately, we have no way to + * check for holding a shared lock, but at least we can + * check for an exclusive one. 
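+	 * A successful drain leaves the caller holding the lock exclusively
+	 * with LK_DRAINING set, just as if LK_EXCLUSIVE had been requested.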
+ */ +#if !defined(MAX_PERF) + if (lkp->lk_lockholder == pid) + panic("lockmgr: draining against myself"); +#endif + + error = acquiredrain(lkp, extflags); + if (error) + break; + lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + + default: +#if !defined(MAX_PERF) + simple_unlock(&lkp->lk_interlock); + panic("lockmgr: unknown locktype request %d", + flags & LK_TYPE_MASK); +#endif + /* NOTREACHED */ + } + if ((lkp->lk_flags & LK_WAITDRAIN) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | + LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) { + lkp->lk_flags &= ~LK_WAITDRAIN; + wakeup((void *)&lkp->lk_flags); + } + simple_unlock(&lkp->lk_interlock); + return (error); +} + +static int +acquiredrain(struct lock *lkp, int extflags) { + int error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) { + return EBUSY; + } + + error = apause(lkp, LK_ALL); + if (error == 0) + return 0; + + while (lkp->lk_flags & LK_ALL) { + lkp->lk_flags |= LK_WAITDRAIN; + simple_unlock(&lkp->lk_interlock); + error = tsleep(&lkp->lk_flags, lkp->lk_prio, + lkp->lk_wmesg, lkp->lk_timo); + simple_lock(&lkp->lk_interlock); + if (error) + return error; + if (extflags & LK_SLEEPFAIL) { + return ENOLCK; + } + } + return 0; +} + +/* + * Initialize a lock; required before use. + */ +void +lockinit(lkp, prio, wmesg, timo, flags) + struct lock *lkp; + int prio; + char *wmesg; + int timo; + int flags; +{ + + simple_lock_init(&lkp->lk_interlock); + lkp->lk_flags = (flags & LK_EXTFLG_MASK); + lkp->lk_sharecount = 0; + lkp->lk_waitcount = 0; + lkp->lk_exclusivecount = 0; + lkp->lk_prio = prio; + lkp->lk_wmesg = wmesg; + lkp->lk_timo = timo; + lkp->lk_lockholder = LK_NOPROC; +} + +/* + * Determine the status of a lock. + */ +int +lockstatus(lkp) + struct lock *lkp; +{ + int lock_type = 0; + + simple_lock(&lkp->lk_interlock); + if (lkp->lk_exclusivecount != 0) + lock_type = LK_EXCLUSIVE; + else if (lkp->lk_sharecount != 0) + lock_type = LK_SHARED; + simple_unlock(&lkp->lk_interlock); + return (lock_type); +} + +/* + * Print out information about state of a lock. Used by VOP_PRINT + * routines to display status about contained locks. + */ +void +lockmgr_printinfo(lkp) + struct lock *lkp; +{ + + if (lkp->lk_sharecount) + printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg, + lkp->lk_sharecount); + else if (lkp->lk_flags & LK_HAVE_EXCL) + printf(" lock type %s: EXCL (count %d) by pid %d", + lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder); + if (lkp->lk_waitcount > 0) + printf(" with %d pending", lkp->lk_waitcount); +} + +#if defined(SIMPLELOCK_DEBUG) && (NCPUS == 1 || defined(COMPILING_LINT)) +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int lockpausetime = 0; +SYSCTL_INT(_debug, OID_AUTO, lockpausetime, CTLFLAG_RW, &lockpausetime, 0, ""); + +static int simplelockrecurse; + +/* + * Simple lock functions so that the debugger can see from whence + * they are being called. 
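+ * These are the uniprocessor (NCPUS == 1) debugging stubs: lock_data is
+ * used as a plain 0/1 flag, and the debug.lockpausetime sysctl picks the
+ * reaction to a violation, namely panic when -1, Debugger() when 1, a
+ * timed sleep when greater than 1, and just the printf otherwise.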
+ */ +void +simple_lock_init(alp) + struct simplelock *alp; +{ + + alp->lock_data = 0; +} + +void +_simple_lock(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (simplelockrecurse) + return; + if (alp->lock_data == 1) { + if (lockpausetime == -1) + panic("%s:%d: simple_lock: lock held", id, l); + printf("%s:%d: simple_lock: lock held\n", id, l); + if (lockpausetime == 1) { + Debugger("simple_lock"); + /*BACKTRACE(curproc); */ + } else if (lockpausetime > 1) { + printf("%s:%d: simple_lock: lock held...", id, l); + tsleep(&lockpausetime, PCATCH | PPAUSE, "slock", + lockpausetime * hz); + printf(" continuing\n"); + } + } + alp->lock_data = 1; + if (curproc) + curproc->p_simple_locks++; +} + +int +_simple_lock_try(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (alp->lock_data) + return (0); + if (simplelockrecurse) + return (1); + alp->lock_data = 1; + if (curproc) + curproc->p_simple_locks++; + return (1); +} + +void +_simple_unlock(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (simplelockrecurse) + return; + if (alp->lock_data == 0) { + if (lockpausetime == -1) + panic("%s:%d: simple_unlock: lock not held", id, l); + printf("%s:%d: simple_unlock: lock not held\n", id, l); + if (lockpausetime == 1) { + Debugger("simple_unlock"); + /* BACKTRACE(curproc); */ + } else if (lockpausetime > 1) { + printf("%s:%d: simple_unlock: lock not held...", id, l); + tsleep(&lockpausetime, PCATCH | PPAUSE, "sunlock", + lockpausetime * hz); + printf(" continuing\n"); + } + } + alp->lock_data = 0; + if (curproc) + curproc->p_simple_locks--; +} +#elif defined(SIMPLELOCK_DEBUG) +#error "SIMPLELOCK_DEBUG is not compatible with SMP!" +#endif /* SIMPLELOCK_DEBUG && NCPUS == 1 */ diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..cc1b8a5 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $Id: kern_lockf.c,v 1.19 1998/07/29 17:38:14 bde Exp $ + */ + +#include "opt_debug_lockf.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> + +#include <sys/lockf.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. + */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + + +static int lockf_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock __P((struct lockf *)); +static int lf_findoverlap __P((struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **)); +static struct lockf * + lf_getblock __P((struct lockf *)); +static int lf_getlock __P((struct lockf *, struct flock *)); +static int lf_setlock __P((struct lockf *)); +static void lf_split __P((struct lockf *, struct lockf *)); +static void lf_wakelock __P((struct lockf *)); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else { + end = start + fl->l_len - 1; + if (end < start) + return (EINVAL); + } + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; +/* lock->lf_inode = ip; */ /* XXX JH */ + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. 
+ */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. + */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) { + /* + * We may have been awakened by a signal (in + * which case we must remove ourselves from the + * blocked list) and/or by another process + * releasing a lock (in which case we have already + * been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, + lf_block); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. 
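+	 * lf_findoverlap() classifies each of our own overlapping locks
+	 * into one of the six cases enumerated below; cases 3 and 4
+	 * continue the scan, the rest terminate it.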
+ */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (ltmp = overlap->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + } + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
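+ * Any process sleeping in lf_setlock() on the overlapping lock is
+ * awakened here and rescans the list, since the range it was
+ * blocked on may now be free.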
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
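+ * Callers that need every overlap (lf_setlock(), lf_clearlock() and
+ * lf_getblock()) simply step past the overlap just returned and
+ * call this routine again.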
+ */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
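+ * For example, carving [40..59] out of a lock covering [0..99]
+ * needs three pieces: [0..39], the new range, and [60..99].  If the
+ * two ranges share a start or an end, the original lock is merely
+ * trimmed and no extra piece is allocated.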
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (wakelock = listhead->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock %p for ", tag, (void *)lock); + if (lock->lf_flags & F_POSIX) + printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); + else + printf("id %p", (void *)lock->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld", + (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + if (lock->lf_blkhd.tqh_first) + printf(" block %p\n", (void *)lock->lf_blkhd.tqh_first); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + printf("%s: Lock list for ino %lu on dev <%d, %d>:\n", + tag, (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock %p for ",(void *)lf); + if (lf->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)lf->lf_id)->p_pid); + else + printf("id %p", (void *)lf->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lf->lf_start, (long)lf->lf_end); + for (blk = lf->lf_blkhd.tqh_first; blk; + blk = blk->lf_block.tqe_next) { + printf("\n\t\tlock request %p for ", (void *)blk); + if (blk->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)blk->lf_id)->p_pid); + else + printf("id %p", (void *)blk->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? 
"unlock" : + "unknown", (long)blk->lf_start, + (long)blk->lf_end); + if (blk->lf_blkhd.tqh_first) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 0000000..be9f9d3 --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $ + */ + +#include "opt_vm.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#define MALLOC_INSTANTIATE +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/vmmeter.h> +#include <sys/lock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static void kmeminit __P((void *)); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static MALLOC_DEFINE(M_FREE, "free", "should be on free list"); + +static struct malloc_type *kmemstatistics; +static struct kmembuckets bucket[MINBUCKET + 16]; +static struct kmemusage *kmemusage; +static char *kmembase; +static char *kmemlimit; +static int vm_kmem_size; + +#ifdef INVARIANTS +/* + * This structure provides a set of masks to catch unaligned frees. + */ +static long addrmask[] = { 0, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +}; + +/* + * The WEIRD_ADDR is used as known text to copy into free objects so + * that modifications after frees can be detected. 
+ */ +#define WEIRD_ADDR 0xdeadc0de +#define MAX_COPY 64 + +/* + * Normally the first word of the structure is used to hold the list + * pointer for free objects. However, when running with diagnostics, + * we use the third and fourth fields, so as to catch modifications + * in the most commonly trashed first two words. + */ +struct freelist { + long spare0; + struct malloc_type *type; + long spare1; + caddr_t next; +}; +#else /* !INVARIANTS */ +struct freelist { + caddr_t next; +}; +#endif /* INVARIANTS */ + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + * + * If M_ASLEEP is set (M_NOWAIT must also be set), this routine + * will have the side effect of calling asleep() if it returns NULL, + * allowing the parent to await() at some future time. + */ +void * +malloc(size, type, flags) + unsigned long size; + struct malloc_type *type; + int flags; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long indx, npg, allocsize; + int s; + caddr_t va, cp, savedlist; +#ifdef INVARIANTS + long *end, *lp; + int copysize; + char *savedtype; +#endif + register struct malloc_type *ksp = type; + + /* + * Must be at splmem() prior to initializing segment to handle + * potential initialization race. + */ + + s = splmem(); + + if (!type->ks_next) { + malloc_init(type); + } + + indx = BUCKETINDX(size); + kbp = &bucket[indx]; + + while (ksp->ks_memuse >= ksp->ks_limit) { + if (flags & M_ASLEEP) { + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } + if (flags & M_NOWAIT) { + splx(s); + return ((void *) NULL); + } + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + tsleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } + ksp->ks_size |= 1 << indx; +#ifdef INVARIANTS + copysize = 1 << indx < MAX_COPY ? 1 << indx : MAX_COPY; +#endif + if (kbp->kb_next == NULL) { + kbp->kb_last = NULL; + if (size > MAXALLOCSAVE) + allocsize = roundup(size, PAGE_SIZE); + else + allocsize = 1 << indx; + npg = btoc(allocsize); + va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags); + if (va == NULL) { + splx(s); + return ((void *) NULL); + } + kbp->kb_total += kbp->kb_elmpercl; + kup = btokup(va); + kup->ku_indx = indx; + if (allocsize > MAXALLOCSAVE) { + if (npg > 65535) + panic("malloc: allocation too large"); + kup->ku_pagecnt = npg; + ksp->ks_memuse += allocsize; + goto out; + } + kup->ku_freecnt = kbp->kb_elmpercl; + kbp->kb_totalfree += kbp->kb_elmpercl; + /* + * Just in case we blocked while allocating memory, + * and someone else also allocated memory for this + * bucket, don't assume the list is still empty. + */ + savedlist = kbp->kb_next; + kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize; + for (;;) { + freep = (struct freelist *)cp; +#ifdef INVARIANTS + /* + * Copy in known text to detect modification + * after freeing. 
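+ * (Only the first `copysize' bytes -- at most MAX_COPY -- of each
+ * element carved from the new page run are filled with WEIRD_ADDR;
+ * when the element is later handed out, malloc() checks those words
+ * and reports any change as "Data modified on freelist".)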
+ */ + end = (long *)&cp[copysize]; + for (lp = (long *)cp; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = M_FREE; +#endif /* INVARIANTS */ + if (cp <= va) + break; + cp -= allocsize; + freep->next = cp; + } + freep->next = savedlist; + if (kbp->kb_last == NULL) + kbp->kb_last = (caddr_t)freep; + } + va = kbp->kb_next; + kbp->kb_next = ((struct freelist *)va)->next; +#ifdef INVARIANTS + freep = (struct freelist *)va; + savedtype = (char *) type->ks_shortdesc; +#if BYTE_ORDER == BIG_ENDIAN + freep->type = (struct malloc_type *)WEIRD_ADDR >> 16; +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + freep->type = (struct malloc_type *)WEIRD_ADDR; +#endif + if ((intptr_t)(void *)&freep->next & 0x2) + freep->next = (caddr_t)((WEIRD_ADDR >> 16)|(WEIRD_ADDR << 16)); + else + freep->next = (caddr_t)WEIRD_ADDR; + end = (long *)&va[copysize]; + for (lp = (long *)va; lp < end; lp++) { + if (*lp == WEIRD_ADDR) + continue; + printf("%s %ld of object %p size %lu %s %s (0x%lx != 0x%lx)\n", + "Data modified on freelist: word", + (long)(lp - (long *)va), (void *)va, size, + "previous type", savedtype, *lp, (u_long)WEIRD_ADDR); + break; + } + freep->spare0 = 0; +#endif /* INVARIANTS */ + kup = btokup(va); + if (kup->ku_indx != indx) + panic("malloc: wrong bucket"); + if (kup->ku_freecnt == 0) + panic("malloc: lost data"); + kup->ku_freecnt--; + kbp->kb_totalfree--; + ksp->ks_memuse += 1 << indx; +out: + kbp->kb_calls++; + ksp->ks_inuse++; + ksp->ks_calls++; + if (ksp->ks_memuse > ksp->ks_maxused) + ksp->ks_maxused = ksp->ks_memuse; + splx(s); + return ((void *) va); +} + +/* + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. + */ +void +free(addr, type) + void *addr; + struct malloc_type *type; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long size; + int s; +#ifdef INVARIANTS + struct freelist *fp; + long *end, *lp, alloc, copysize; +#endif + register struct malloc_type *ksp = type; + + if (!type->ks_next) + panic("freeing with unknown type (%s)", type->ks_shortdesc); + + KASSERT(kmembase <= (char *)addr && (char *)addr < kmemlimit, + ("free: address %p out of range", (void *)addr)); + kup = btokup(addr); + size = 1 << kup->ku_indx; + kbp = &bucket[kup->ku_indx]; + s = splmem(); +#ifdef INVARIANTS + /* + * Check for returns of data that do not point to the + * beginning of the allocation. + */ + if (size > PAGE_SIZE) + alloc = addrmask[BUCKETINDX(PAGE_SIZE)]; + else + alloc = addrmask[kup->ku_indx]; + if (((uintptr_t)(void *)addr & alloc) != 0) + panic("free: unaligned addr %p, size %ld, type %s, mask %ld", + (void *)addr, size, type->ks_shortdesc, alloc); +#endif /* INVARIANTS */ + if (size > MAXALLOCSAVE) { + kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); + size = kup->ku_pagecnt << PAGE_SHIFT; + ksp->ks_memuse -= size; + kup->ku_indx = 0; + kup->ku_pagecnt = 0; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; + kbp->kb_total -= 1; + splx(s); + return; + } + freep = (struct freelist *)addr; +#ifdef INVARIANTS + /* + * Check for multiple frees. Use a quick check to see if + * it looks free before laboriously searching the freelist. 
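+ * If the first word already holds WEIRD_ADDR the object looks free,
+ * so the bucket's free list is searched: finding the same address
+ * there means free() was called twice on it, which is a panic.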
+ */ + if (freep->spare0 == WEIRD_ADDR) { + fp = (struct freelist *)kbp->kb_next; + while (fp) { + if (fp->spare0 != WEIRD_ADDR) + panic("free: free item %p modified", fp); + else if (addr == (caddr_t)fp) + panic("free: multiple freed item %p", addr); + fp = (struct freelist *)fp->next; + } + } + /* + * Copy in known text to detect modification after freeing + * and to make it look free. Also, save the type being freed + * so we can list likely culprit if modification is detected + * when the object is reallocated. + */ + copysize = size < MAX_COPY ? size : MAX_COPY; + end = (long *)&((caddr_t)addr)[copysize]; + for (lp = (long *)addr; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = type; +#endif /* INVARIANTS */ + kup->ku_freecnt++; + if (kup->ku_freecnt >= kbp->kb_elmpercl) + if (kup->ku_freecnt > kbp->kb_elmpercl) + panic("free: multiple frees"); + else if (kbp->kb_totalfree > kbp->kb_highwat) + kbp->kb_couldfree++; + kbp->kb_totalfree++; + ksp->ks_memuse -= size; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; +#ifdef OLD_MALLOC_MEMORY_POLICY + if (kbp->kb_next == NULL) + kbp->kb_next = addr; + else + ((struct freelist *)kbp->kb_last)->next = addr; + freep->next = NULL; + kbp->kb_last = addr; +#else + /* + * Return memory to the head of the queue for quick reuse. This + * can improve performance by improving the probability of the + * item being in the cache when it is reused. + */ + if (kbp->kb_next == NULL) { + kbp->kb_next = addr; + kbp->kb_last = addr; + freep->next = NULL; + } else { + freep->next = kbp->kb_next; + kbp->kb_next = addr; + } +#endif + splx(s); +} + +/* + * Initialize the kernel memory allocator + */ +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; +{ + register long indx; + int npg; + int mem_size; + +#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) +#error "kmeminit: MAXALLOCSAVE not power of 2" +#endif +#if (MAXALLOCSAVE > MINALLOCSIZE * 32768) +#error "kmeminit: MAXALLOCSAVE too big" +#endif +#if (MAXALLOCSAVE < PAGE_SIZE) +#error "kmeminit: MAXALLOCSAVE too small" +#endif + + /* + * Try to auto-tune the kernel memory size, so that it is + * more applicable for a wider range of machine sizes. + * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while + * a VM_KMEM_SIZE of 12MB is a fair compromise. The + * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space + * available, and on an X86 with a total KVA space of 256MB, + * try to keep VM_KMEM_SIZE_MAX at 80MB or below. + * + * Note that the kmem_map is also used by the zone allocator, + * so make sure that there is enough space. 
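+ * For example, with 64MB of physical memory and the x86 scale of 4
+ * suggested above, vm_kmem_size is raised from the compile-time
+ * VM_KMEM_SIZE to 16MB; the VM_KMEM_SIZE_MAX and twice-physical-
+ * memory caps below still apply.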
+ */ + vm_kmem_size = VM_KMEM_SIZE; + mem_size = cnt.v_page_count * PAGE_SIZE; + +#if defined(VM_KMEM_SIZE_SCALE) + if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size) + vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE; +#endif + +#if defined(VM_KMEM_SIZE_MAX) + if (vm_kmem_size >= VM_KMEM_SIZE_MAX) + vm_kmem_size = VM_KMEM_SIZE_MAX; +#endif + + if (vm_kmem_size > 2 * (cnt.v_page_count * PAGE_SIZE)) + vm_kmem_size = 2 * (cnt.v_page_count * PAGE_SIZE); + + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + vm_kmem_size) + / PAGE_SIZE; + + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, + (vm_size_t)(npg * sizeof(struct kmemusage))); + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + kmem_map->system_map = 1; + for (indx = 0; indx < MINBUCKET + 16; indx++) { + if (1 << indx >= PAGE_SIZE) + bucket[indx].kb_elmpercl = 1; + else + bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx); + bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; + } +} + +void +malloc_init(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (type->ks_next) + return; + + if (cnt.v_page_count == 0) + panic("malloc_init not allowed before vm init"); + + /* + * The default limits for each malloc region is 1/2 of the + * malloc portion of the kmem map size. + */ + type->ks_limit = vm_kmem_size / 2; + type->ks_next = kmemstatistics; + kmemstatistics = type; +} + +void +malloc_uninit(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + struct malloc_type *t; + + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_uninit not allowed before vm init"); + + if (type == kmemstatistics) + kmemstatistics = type->ks_next; + else { + for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) { + if (t->ks_next == type) { + t->ks_next = type->ks_next; + break; + } + } + } +} diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..22fcd33 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_mib.c,v 1.15 1998/03/28 11:49:52 dufault Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/unistd.h> + +#if defined(SMP) +#include <machine/smp.h> +#endif + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); + +SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, + "p1003_1b, (see p1003_1b.h)"); + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, ""); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, ""); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, ""); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, ""); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, + CTLFLAG_RW, &maxprocperuid, 0, ""); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, ""); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, ""); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, ""); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, ""); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, ""); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, ""); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + +SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, + CTLFLAG_RW, kernelname, sizeof kernelname, ""); + +#ifdef SMP +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, ""); +#else +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, ""); +#endif + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, ""); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, ""); + +static char machine_arch[] = MACHINE_ARCH; +SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, + machine_arch, 0, ""); + +char hostname[MAXHOSTNAMELEN]; + +SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW, + 
hostname, sizeof(hostname), ""); + +int securelevel = -1; + +static int +sysctl_kern_securelvl SYSCTL_HANDLER_ARGS +{ + int error, level; + + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + if (level < securelevel) + return (EPERM); + securelevel = level; + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_securelvl, "I", ""); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), ""); + +long hostid; +/* Some trouble here, if sizeof (int) != sizeof (long) */ +SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, ""); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. + */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, ""); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, ""); diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c new file mode 100644 index 0000000..afe9f2e --- /dev/null +++ b/sys/kern/kern_module.c @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_module.c,v 1.13 1999/01/09 14:59:50 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/proc.h> + +#define M_MODULE M_TEMP /* XXX */ + +typedef TAILQ_HEAD(, module) modulelist_t; +struct module { + TAILQ_ENTRY(module) link; /* chain together all modules */ + TAILQ_ENTRY(module) flink; /* all modules in a file */ + struct linker_file* file; /* file which contains this module */ + int refs; /* reference count */ + int id; /* unique id number */ + char *name; /* module name */ + modeventhand_t handler; /* event handler */ + void *arg; /* argument for handler */ + modspecific_t data; /* module specific data */ +}; + +#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) + +static modulelist_t modules; +static int nextid = 1; + +static void module_shutdown(int, void*); + +static void +module_init(void* arg) +{ + TAILQ_INIT(&modules); + at_shutdown(module_shutdown, 0, SHUTDOWN_POST_SYNC); +} + +SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0); + +static void +module_shutdown(int arg1, void* arg2) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) + MOD_EVENT(mod, MOD_SHUTDOWN); +} + +void +module_register_init(void *arg) +{ + moduledata_t* data = (moduledata_t*) arg; + int error; + + error = module_register(data->name, data->evhand, data->priv, data->_file); + if (error) + printf("module_register_init: module_register(%s, %lx, %p) error %d\n", + data->name, (u_long)(uintfptr_t)data->evhand, data->priv, error); +} + +int +module_register(const char* name, modeventhand_t handler, void* arg, void *file) +{ + size_t namelen; + module_t newmod; + int error; + linker_file_t container = file; + + namelen = strlen(name) + 1; + newmod = (module_t) malloc(sizeof(struct module) + namelen, + M_MODULE, M_WAITOK); + if (newmod == 0) + return ENOMEM; + + newmod->refs = 1; + newmod->id = nextid++; + newmod->name = (char *) (newmod + 1); + strcpy(newmod->name, name); + newmod->handler = handler; + newmod->arg = arg; + bzero(&newmod->data, sizeof(newmod->data)); + TAILQ_INSERT_TAIL(&modules, newmod, link); + + if (container == NULL) + container = linker_current_file; + if (container) { + TAILQ_INSERT_TAIL(&container->modules, newmod, flink); + newmod->file = container; + } else + newmod->file = 0; + + if (error = MOD_EVENT(newmod, MOD_LOAD)) { + MOD_EVENT(newmod, MOD_UNLOAD); + module_release(newmod); + return error; + } + + return 0; +} + +void +module_reference(module_t mod) +{ + MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); + + mod->refs++; +} + +void +module_release(module_t mod) +{ + if (mod->refs <= 0) + panic("module_release: bad reference count"); + + MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); + + mod->refs--; + if (mod->refs == 0) { + TAILQ_REMOVE(&modules, mod, link); + if 
(mod->file) { + TAILQ_REMOVE(&mod->file->modules, mod, flink); + } + free(mod, M_MODULE); + } +} + +module_t +module_lookupbyname(const char* name) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) { + if (!strcmp(mod->name, name)) + return mod; + } + + return 0; +} + +module_t +module_lookupbyid(int modid) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) { + if (mod->id == modid) + return mod; + } + + return 0; +} + +int +module_unload(module_t mod) +{ + return MOD_EVENT(mod, MOD_UNLOAD); +} + +int +module_getid(module_t mod) +{ + return mod->id; +} + +module_t +module_getfnext(module_t mod) +{ + return TAILQ_NEXT(mod, flink); +} + +void +module_setspecific(module_t mod, modspecific_t *datap) +{ + mod->data = *datap; +} + +/* + * Syscalls. + */ +int +modnext(struct proc* p, struct modnext_args* uap) +{ + module_t mod; + + p->p_retval[0] = -1; + if (SCARG(uap, modid) == 0) { + mod = TAILQ_FIRST(&modules); + if (mod) { + p->p_retval[0] = mod->id; + return 0; + } else + return ENOENT; + } + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + if (TAILQ_NEXT(mod, link)) + p->p_retval[0] = TAILQ_NEXT(mod, link)->id; + else + p->p_retval[0] = 0; + return 0; +} + +int +modfnext(struct proc* p, struct modfnext_args* uap) +{ + module_t mod; + + p->p_retval[0] = -1; + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + if (TAILQ_NEXT(mod, flink)) + p->p_retval[0] = TAILQ_NEXT(mod, flink)->id; + else + p->p_retval[0] = 0; + return 0; +} + +struct module_stat_v1 { + int version; /* set to sizeof(struct module_stat) */ + char name[MAXMODNAME]; + int refs; + int id; +}; + +int +modstat(struct proc* p, struct modstat_args* uap) +{ + module_t mod; + int error = 0; + int namelen; + int version; + struct module_stat* stat; + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. + */ + if (error = copyin(&stat->version, &version, sizeof(version))) + goto out; + if (version != sizeof(struct module_stat_v1) + && version != sizeof(struct module_stat)) { + error = EINVAL; + goto out; + } + + namelen = strlen(mod->name) + 1; + if (namelen > MAXMODNAME) + namelen = MAXMODNAME; + if (error = copyout(mod->name, &stat->name[0], namelen)) + goto out; + + if (error = copyout(&mod->refs, &stat->refs, sizeof(int))) + goto out; + if (error = copyout(&mod->id, &stat->id, sizeof(int))) + goto out; + + /* + * >v1 stat includes module data. + */ + if (version == sizeof(struct module_stat)) { + if (error = copyout(&mod->data, &stat->data, sizeof(mod->data))) + goto out; + } + + p->p_retval[0] = 0; + +out: + return error; +} + +int +modfind(struct proc* p, struct modfind_args* uap) +{ + int error = 0; + char name[MAXMODNAME]; + module_t mod; + + if (error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) + goto out; + + mod = module_lookupbyname(name); + if (!mod) + error = ENOENT; + else + p->p_retval[0] = mod->id; + +out: + return error; +} diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..2f4114d --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,856 @@ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + ******************************************************************************/ + +/* + * Modification history kern_ntptime.c + * + * 24 Sep 94 David L. Mills + * Tightened code at exits. + * + * 24 Mar 94 David L. Mills + * Revised syscall interface to include new variables for PPS + * time discipline. + * + * 14 Feb 94 David L. Mills + * Added code for external clock + * + * 28 Nov 93 David L. Mills + * Revised frequency scaling to conform with adjusted parameters + * + * 17 Sep 93 David L. Mills + * Created file + */ +/* + * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS + * V4.1.1 and V4.1.3 + * + * These routines consitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by hardclock() to adjust the phase and + * frequency of the phase-lock loop which controls the system clock. + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/timex.h> +#include <sys/timepps.h> +#include <sys/sysctl.h> + +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. + * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. + * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. 
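+ * Internally time_offset is kept left-shifted by SHIFT_UPDATE and
+ * time_freq in scaled-ppm units (SHIFT_USEC); hardupdate() and
+ * ntp_update_second() below do the conversions.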
+ */ +static int time_status = STA_UNSYNC; /* clock status bits */ +static int time_state = TIME_OK; /* clock state */ +static long time_offset = 0; /* time offset (us) */ +static long time_constant = 0; /* pll time constant */ +static long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +static long time_precision = 1; /* clock precision (us) */ +static long time_maxerror = MAXPHASE; /* maximum error (us) */ +static long time_esterror = MAXPHASE; /* estimated error (us) */ +static int time_daemon = 0; /* No timedaemon active */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. + * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +long time_phase = 0; /* phase offset (scaled us) */ +static long time_freq = 0; /* frequency offset (scaled ppm) */ +long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. + * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. 
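+ * Both pps_tf[] and pps_ff[] are three-sample median filters; the
+ * MEDIAN3() macro and hardpps() below show how the median and the
+ * dispersion of each triple are extracted.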
+ */ +static struct timeval pps_time; /* kernel time at last interval */ +static long pps_offset = 0; /* pps time offset (us) */ +static long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +static long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +static long pps_freq = 0; /* frequency offset (scaled ppm) */ +static long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +static long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +static long pps_usec = 0; /* microsec counter at last interval */ +static long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +static int pps_glitch = 0; /* pps signal glitch counter */ +static int pps_count = 0; /* calibration interval counter (s) */ +static int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +static int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). + */ +static long pps_jitcnt = 0; /* jitter limit exceeded */ +static long pps_calcnt = 0; /* calibration intervals */ +static long pps_errcnt = 0; /* calibration errors */ +static long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +static void hardupdate __P((int64_t offset, int prescaled)); + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. + * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. + */ +static void +hardupdate(offset, prescaled) + int64_t offset; + int prescaled; +{ + long mtemp; + int64_t ltemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + if (prescaled) + ltemp = offset; + else + ltemp = offset << SHIFT_UPDATE; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset << SHIFT_UPDATE; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. 
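+ * MAXPHASE corresponds to the +-512 ms bound mentioned below, so
+ * larger offsets are simply pinned to that limit.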
+ */ + if (ltemp > (MAXPHASE << SHIFT_UPDATE)) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -(MAXPHASE << SHIFT_UPDATE)) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time_second; + mtemp = time_second - time_reftime; + time_reftime = time_second; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp = time_offset * mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> ((int64_t)time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC + SHIFT_UPDATE); + else + time_freq += ltemp >> ((int64_t)time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC + SHIFT_UPDATE); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + +/* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. + */ +void +ntp_update_second(struct timecounter *tc) +{ + u_int32_t *newsec; + long ltemp; + + if (!time_daemon) + return; + + newsec = &tc->tc_offset_sec; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. 
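+ * The watchdog is pps_valid: it is bumped once per second here and
+ * cleared by hardpps(); if it reaches PPS_VALID the PPS status bits
+ * are cleared and the signal is treated as lost.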
+ */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp << (SHIFT_SCALE - SHIFT_USEC); + else + time_adj += ltemp << (SHIFT_SCALE - SHIFT_USEC); + + tc->tc_adjustment = time_adj; + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if ((*newsec) % 86400 == 0) { + (*newsec)--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if (((*newsec) + 1) % 86400 == 0) { + (*newsec)++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } +} + +static int +ntp_sysctl SYSCTL_HANDLER_ARGS +{ + struct timeval atv; + struct ntptimeval ntv; + int s; + + s = splclock(); + microtime(&atv); + ntv.time = atv; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + splx(s); + + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions + * occur, an error is returned, instead of the status + * word. Most applications will care only about the fact + * the system clock may not be trusted, not about the + * details. 
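+ * Each check below maps one such condition to TIME_ERROR: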
+ * + * Hardware or software error + */ + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS signal lost when either time or frequency + * synchronization requested + */ + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS jitter exceeded when time synchronization + * requested + */ + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS wander exceeded or calibration error when + * frequency synchronization requested + */ + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) { + ntv.time_state = TIME_ERROR; + } + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0, + "NTP kernel PLL related stuff"); +SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +/* + * ntp_adjtime() - NTP daemon application interface + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +int +ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap) +{ + struct timex ntv; + int modes; + int s; + int error; + + time_daemon = 1; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return error; + + /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + */ + modes = ntv.modes; + if ((modes != 0) + && (error = suser(p->p_cred->pc_ucred, &p->p_acflag))) + return error; + + s = splclock(); + if (modes & MOD_FREQUENCY) +#ifdef PPS_SYNC + time_freq = ntv.freq - pps_freq; +#else /* PPS_SYNC */ + time_freq = ntv.freq; +#endif /* PPS_SYNC */ + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) + time_constant = ntv.constant; + if (modes & MOD_OFFSET) + hardupdate(ntv.offset, modes & MOD_DOSCALE); + + ntv.modes |= MOD_CANSCALE; + /* + * Retrieve all clock variables + */ + if (modes & MOD_DOSCALE) + ntv.offset = time_offset; + else if (time_offset < 0) + ntv.offset = -(-time_offset >> SHIFT_UPDATE); + else + ntv.offset = time_offset >> SHIFT_UPDATE; +#ifdef PPS_SYNC + ntv.freq = time_freq + pps_freq; +#else /* PPS_SYNC */ + ntv.freq = time_freq; +#endif /* PPS_SYNC */ + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + ntv.precision = time_precision; + ntv.tolerance = time_tolerance; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = pps_freq; + ntv.jitter = pps_jitter >> PPS_AVG; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + (void)splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (!error) { + /* + * Status word error decode. See comments in + * ntp_gettime() routine. 
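+		 * (Editorial note: the same four conditions checked there --
+		 * unsynchronized clock, loss of the PPS signal while PPS
+		 * time/frequency sync is requested, excessive PPS jitter,
+		 * and PPS wander or calibration error -- map the return
+		 * value to TIME_ERROR here as well.)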
+ */ + p->p_retval[0] = time_state; + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) + p->p_retval[0] = TIME_ERROR; + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) + p->p_retval[0] = TIME_ERROR; + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) + p->p_retval[0] = TIME_ERROR; + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) + p->p_retval[0] = TIME_ERROR; + } + return error; +} + +#ifdef PPS_SYNC + +/* We need this ugly monster twice, so let's macroize it. */ + +#define MEDIAN3X(a, m, s, i1, i2, i3) \ + do { \ + m = a[i2]; \ + s = a[i1] - a[i3]; \ + } while (0) + +#define MEDIAN3(a, m, s) \ + do { \ + if (a[0] > a[1]) { \ + if (a[1] > a[2]) \ + MEDIAN3X(a, m, s, 0, 1, 2); \ + else if (a[2] > a[0]) \ + MEDIAN3X(a, m, s, 2, 0, 1); \ + else \ + MEDIAN3X(a, m, s, 0, 2, 1); \ + } else { \ + if (a[2] > a[1]) \ + MEDIAN3X(a, m, s, 2, 1, 0); \ + else if (a[0] > a[2]) \ + MEDIAN3X(a, m, s, 1, 0, 2); \ + else \ + MEDIAN3X(a, m, s, 1, 2, 0); \ + } \ + } while (0) + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. + * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, p_usec) + struct timeval *tvp; /* time at PPS */ + long p_usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
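+	 * For illustration (hypothetical numbers, editorial note): if the
+	 * three samples in pps_tf[] were { 400, -150, 120 }, MEDIAN3()
+	 * below would pick 120 as the median (the new pps_offset) and
+	 * 400 - (-150) = 550 as the spread that feeds the jitter estimate.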
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + MEDIAN3(pps_tf, pps_offset, v_usec); + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. + */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = p_usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. + */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + MEDIAN3(pps_ff, u_usec, v_usec); + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. 
If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. + */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} + +#endif /* PPS_SYNC */ + +int +std_pps_ioctl(u_long cmd, caddr_t data, pps_params_t *pp, pps_info_t *pi, int ppscap) +{ + pps_params_t *app; + pps_info_t *api; + + switch (cmd) { + case PPS_IOC_CREATE: + return (0); + case PPS_IOC_DESTROY: + return (0); + case PPS_IOC_SETPARAMS: + app = (pps_params_t *)data; + if (app->mode & ~ppscap) + return (EINVAL); + *pp = *app; + return (0); + case PPS_IOC_GETPARAMS: + app = (pps_params_t *)data; + *app = *pp; + return (0); + case PPS_IOC_GETCAP: + *(int*)data = ppscap; + return (0); + case PPS_IOC_FETCH: + api = (pps_info_t *)data; + *api = *pi; + pi->current_mode = pp->mode; + return (0); + case PPS_IOC_WAIT: + return (EOPNOTSUPP); + default: + return (ENODEV); + } +} diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c new file mode 100644 index 0000000..ad63a98 --- /dev/null +++ b/sys/kern/kern_physio.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. 
+ * + * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static void physwakeup __P((struct buf *bp)); +static struct buf * phygetvpbuf(dev_t dev, int resid); + +int +physio(strategy, bp, dev, rw, minp, uio) + d_strategy_t *strategy; + struct buf *bp; + dev_t dev; + int rw; + u_int (*minp) __P((struct buf *bp)); + struct uio *uio; +{ + int i; + int bufflags = rw?B_READ:0; + int error; + int spl; + caddr_t sa; + int bp_alloc = (bp == 0); + struct buf *bpa; + +/* + * keep the process from being swapped + */ + curproc->p_flag |= P_PHYSIO; + + /* create and build a buffer header for a transfer */ + bpa = (struct buf *)phygetvpbuf(dev, uio->uio_resid); + if (!bp_alloc) { + spl = splbio(); + while (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep((caddr_t)bp, PRIBIO, "physbw", 0); + } + bp->b_flags |= B_BUSY; + splx(spl); + } else { + bp = bpa; + } + + /* + * get a copy of the kva from the physical buffer + */ + sa = bpa->b_data; + bp->b_proc = curproc; + error = bp->b_error = 0; + + for(i=0;i<uio->uio_iovcnt;i++) { + while( uio->uio_iov[i].iov_len) { + + bp->b_dev = dev; + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = minp( bp); + if( minp != minphys) + bp->b_bcount = minphys( bp); + bp->b_bufsize = bp->b_bcount; + /* + * pass in the kva from the physical buffer + * for the temporary kernel mapping. + */ + bp->b_saveaddr = sa; + bp->b_blkno = btodb(uio->uio_offset); + bp->b_offset = uio->uio_offset; + + if (uio->uio_segflg == UIO_USERSPACE) { + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + } + + /* perform transfer */ + (*strategy)(bp); + + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + /* release mapping into kernel space */ + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + + /* + * update the uio data + */ + { + int iolen = bp->b_bcount - bp->b_resid; + + if (iolen == 0 && !(bp->b_flags & B_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + } + + /* + * check for an error + */ + if( bp->b_flags & B_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } + + +doerror: + relpbuf(bpa, NULL); + if (!bp_alloc) { + bp->b_flags &= ~(B_BUSY|B_PHYS); + if( bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup((caddr_t)bp); + } + } +/* + * allow the process to be swapped + */ + curproc->p_flag &= ~P_PHYSIO; + + return (error); +} + +u_int +minphys(bp) + struct buf *bp; +{ + u_int maxphys = DFLTPHYS; + struct cdevsw *bdsw; + + bdsw = cdevsw[major(bp->b_dev)]; + + if (bdsw && bdsw->d_maxio) { + maxphys = bdsw->d_maxio; + } + if (bp->b_kvasize && (bp->b_kvasize < maxphys)) + maxphys = bp->b_kvasize; + + if(((vm_offset_t) bp->b_data) & PAGE_MASK) { + maxphys -= PAGE_SIZE; + } + + if( bp->b_bcount > maxphys) { + bp->b_bcount = maxphys; + } + + return bp->b_bcount; +} + +struct buf * +phygetvpbuf(dev_t dev, int resid) +{ + struct cdevsw *bdsw; + int maxio; + + 
bdsw = cdevsw[major(dev)]; + if ((bdsw == NULL) || (bdsw->d_bmaj == -1)) + return getpbuf(NULL); + + maxio = bdsw->d_maxio; + if (resid > maxio) + resid = maxio; + + return getpbuf(NULL); +} + +static void +physwakeup(bp) + struct buf *bp; +{ + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; +} diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 0000000..0c6feac --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $Id: kern_proc.c,v 1.42 1999/01/10 01:58:24 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/tty.h> +#include <sys/signalvar.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <vm/vm_zone.h> + +static MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); +MALLOC_DEFINE(M_SESSION, "session", "session header"); +static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); +MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); + +struct prochd qs[NQS]; /* as good a place as any... */ +struct prochd rtqs[NQS]; /* Space for REALTIME queues too */ +struct prochd idqs[NQS]; /* Space for IDLE queues too */ + +static void pgdelete __P((struct pgrp *)); + +/* + * Structure associated with user cacheing. 
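+ * (Editorial note: each uidinfo record counts the processes currently
+ * charged to one uid; see chgproccnt() below.)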
+ */ +struct uidinfo { + LIST_ENTRY(uidinfo) ui_hash; + uid_t ui_uid; + long ui_proccnt; +}; +#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) +static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; +static u_long uihash; /* size of hash table - 1 */ + +static void orphanpg __P((struct pgrp *pg)); + +/* + * Other process lists + */ +struct pidhashhead *pidhashtbl; +u_long pidhash; +struct pgrphashhead *pgrphashtbl; +u_long pgrphash; +struct proclist allproc; +struct proclist zombproc; +vm_zone_t proc_zone; + +/* + * Initialize global process hashing structures. + */ +void +procinit() +{ + + LIST_INIT(&allproc); + LIST_INIT(&zombproc); + pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); + pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); + uihashtbl = hashinit(maxproc / 16, M_PROC, &uihash); + proc_zone = zinit("PROC", sizeof (struct proc), 0, 0, 5); +} + +/* + * Change the count associated with number of processes + * a given user is using. + */ +int +chgproccnt(uid, diff) + uid_t uid; + int diff; +{ + register struct uidinfo *uip; + register struct uihashhead *uipp; + + uipp = UIHASH(uid); + for (uip = uipp->lh_first; uip != 0; uip = uip->ui_hash.le_next) + if (uip->ui_uid == uid) + break; + if (uip) { + uip->ui_proccnt += diff; + if (uip->ui_proccnt > 0) + return (uip->ui_proccnt); + if (uip->ui_proccnt < 0) + panic("chgproccnt: procs < 0"); + LIST_REMOVE(uip, ui_hash); + FREE(uip, M_PROC); + return (0); + } + if (diff <= 0) { + if (diff == 0) + return(0); + panic("chgproccnt: lost user"); + } + MALLOC(uip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK); + LIST_INSERT_HEAD(uipp, uip, ui_hash); + uip->ui_uid = uid; + uip->ui_proccnt = diff; + return (diff); +} + +/* + * Is p an inferior of the current process? + */ +int +inferior(p) + register struct proc *p; +{ + + for (; p != curproc; p = p->p_pptr) + if (p->p_pid == 0) + return (0); + return (1); +} + +/* + * Locate a process by number + */ +struct proc * +pfind(pid) + register pid_t pid; +{ + register struct proc *p; + + for (p = PIDHASH(pid)->lh_first; p != 0; p = p->p_hash.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + +/* + * Locate a process group by number + */ +struct pgrp * +pgfind(pgid) + register pid_t pgid; +{ + register struct pgrp *pgrp; + + for (pgrp = PGRPHASH(pgid)->lh_first; pgrp != 0; + pgrp = pgrp->pg_hash.le_next) + if (pgrp->pg_id == pgid) + return (pgrp); + return (NULL); +} + +/* + * Move p to a new or existing process group (and session) + */ +int +enterpgrp(p, pgid, mksess) + register struct proc *p; + pid_t pgid; + int mksess; +{ + register struct pgrp *pgrp = pgfind(pgid); + + KASSERT(pgrp == NULL || !mksess, + ("enterpgrp: setsid into non-empty pgrp")); + KASSERT(!SESS_LEADER(p), + ("enterpgrp: session leader attempted setpgrp")); + + if (pgrp == NULL) { + pid_t savepid = p->p_pid; + struct proc *np; + /* + * new process group + */ + KASSERT(p->p_pid == pgid, + ("enterpgrp: new pgrp and pid != pgid")); + MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, + M_WAITOK); + if ((np = pfind(savepid)) == NULL || np != p) + return (ESRCH); + if (mksess) { + register struct session *sess; + + /* + * new session + */ + MALLOC(sess, struct session *, sizeof(struct session), + M_SESSION, M_WAITOK); + sess->s_leader = p; + sess->s_sid = p->p_pid; + sess->s_count = 1; + sess->s_ttyvp = NULL; + sess->s_ttyp = NULL; + bcopy(p->p_session->s_login, sess->s_login, + sizeof(sess->s_login)); + p->p_flag &= ~P_CONTROLT; + pgrp->pg_session = sess; + KASSERT(p == curproc, + ("enterpgrp: mksession 
and p != curproc")); + } else { + pgrp->pg_session = p->p_session; + pgrp->pg_session->s_count++; + } + pgrp->pg_id = pgid; + LIST_INIT(&pgrp->pg_members); + LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); + pgrp->pg_jobc = 0; + SLIST_INIT(&pgrp->pg_sigiolst); + } else if (pgrp == p->p_pgrp) + return (0); + + /* + * Adjust eligibility of affected pgrps to participate in job control. + * Increment eligibility counts before decrementing, otherwise we + * could reach 0 spuriously during the first call. + */ + fixjobc(p, pgrp, 1); + fixjobc(p, p->p_pgrp, 0); + + LIST_REMOVE(p, p_pglist); + if (p->p_pgrp->pg_members.lh_first == 0) + pgdelete(p->p_pgrp); + p->p_pgrp = pgrp; + LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); + return (0); +} + +/* + * remove process from process group + */ +int +leavepgrp(p) + register struct proc *p; +{ + + LIST_REMOVE(p, p_pglist); + if (p->p_pgrp->pg_members.lh_first == 0) + pgdelete(p->p_pgrp); + p->p_pgrp = 0; + return (0); +} + +/* + * delete a process group + */ +static void +pgdelete(pgrp) + register struct pgrp *pgrp; +{ + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pgid. + */ + funsetownlst(&pgrp->pg_sigiolst); + + if (pgrp->pg_session->s_ttyp != NULL && + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp = NULL; + LIST_REMOVE(pgrp, pg_hash); + if (--pgrp->pg_session->s_count == 0) + FREE(pgrp->pg_session, M_SESSION); + FREE(pgrp, M_PGRP); +} + +/* + * Adjust pgrp jobc counters when specified process changes process group. + * We count the number of processes in each process group that "qualify" + * the group for terminal job control (those with a parent in a different + * process group of the same session). If that count reaches zero, the + * process group becomes orphaned. Check both the specified process' + * process group and that of its children. + * entering == 0 => p is leaving specified group. + * entering == 1 => p is entering specified group. + */ +void +fixjobc(p, pgrp, entering) + register struct proc *p; + register struct pgrp *pgrp; + int entering; +{ + register struct pgrp *hispgrp; + register struct session *mysession = pgrp->pg_session; + + /* + * Check p's parent to see whether p qualifies its own process + * group; if so, adjust count for p's process group. + */ + if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && + hispgrp->pg_session == mysession) + if (entering) + pgrp->pg_jobc++; + else if (--pgrp->pg_jobc == 0) + orphanpg(pgrp); + + /* + * Check this process' children to see whether they qualify + * their process groups; if so, adjust counts for children's + * process groups. + */ + for (p = p->p_children.lh_first; p != 0; p = p->p_sibling.le_next) + if ((hispgrp = p->p_pgrp) != pgrp && + hispgrp->pg_session == mysession && + p->p_stat != SZOMB) + if (entering) + hispgrp->pg_jobc++; + else if (--hispgrp->pg_jobc == 0) + orphanpg(hispgrp); +} + +/* + * A process group has become orphaned; + * if there are any stopped processes in the group, + * hang-up all process in that group. 
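+ * (Editorial note: each member then receives SIGHUP followed by SIGCONT,
+ * matching the POSIX handling of orphaned process groups that contain
+ * stopped members.)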
+ */ +static void +orphanpg(pg) + struct pgrp *pg; +{ + register struct proc *p; + + for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { + if (p->p_stat == SSTOP) { + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + psignal(p, SIGHUP); + psignal(p, SIGCONT); + } + return; + } + } +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) +{ + register struct pgrp *pgrp; + register struct proc *p; + register int i; + + for (i = 0; i <= pgrphash; i++) { + if (pgrp = pgrphashtbl[i].lh_first) { + printf("\tindx %d\n", i); + for (; pgrp != 0; pgrp = pgrp->pg_hash.le_next) { + printf( + "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", + (void *)pgrp, (long)pgrp->pg_id, + (void *)pgrp->pg_session, + pgrp->pg_session->s_count, + (void *)pgrp->pg_members.lh_first); + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + printf("\t\tpid %ld addr %p pgrp %p\n", + (long)p->p_pid, (void *)p, + (void *)p->p_pgrp); + } + } + } + } +} +#endif /* DDB */ + +/* + * Fill in an eproc structure for the specified process. + */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + bzero(ep, sizeof(*ep)); + + ep->e_paddr = p; + if (p->p_cred) { + ep->e_pcred = *p->p_cred; + if (p->p_ucred) + ep->e_ucred = *p->p_ucred; + } +#ifdef COMPAT_LINUX_THREADS + if (p->p_procsig){ + ep->e_procsig = *p->p_procsig; + } +#endif + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; + ep->e_vm.vm_taddr = vm->vm_taddr; + ep->e_vm.vm_daddr = vm->vm_daddr; + ep->e_vm.vm_minsaddr = vm->vm_minsaddr; + ep->e_vm.vm_maxsaddr = vm->vm_maxsaddr; + ep->e_vm.vm_map = vm->vm_map; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + if (p->p_pgrp) { + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + ep->e_sess = p->p_pgrp->pg_session; + + if (ep->e_sess) { + bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login)); + if (ep->e_sess->s_ttyvp) + ep->e_flag = EPROC_CTTY; + if (p->p_session && SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + } + } + if ((p->p_flag & P_CONTROLT) && + (ep->e_sess != NULL) && + ((tp = ep->e_sess->s_ttyp) != NULL)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + if (p->p_wmesg) { + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_wmesg[WMESGLEN] = 0; + } +} + +static struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + + +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct eproc eproc; + int error; + pid_t pid = p->p_pid; + + fill_eproc(p, &eproc); + error = SYSCTL_OUT(req,(caddr_t)p, sizeof(struct proc)); + if (error) + return (error); + error = SYSCTL_OUT(req,(caddr_t)&eproc, sizeof(eproc)); + if (error) + return (error); + if (!doingzomb && pid && (pfind(pid) != p)) + return EAGAIN; + if (doingzomb && zpfind(pid) != p) + return EAGAIN; + return (0); +} + +static int +sysctl_kern_proc SYSCTL_HANDLER_ARGS +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = allproc.lh_first; + else + p = zombproc.lh_first; + for (; p != 0; p = p->p_list.le_next) { + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. + */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[0]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_cred->p_ruid != (uid_t)name[0]) + continue; + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) + return (error); + } + } + return (0); +} + + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", ""); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c new file mode 100644 index 0000000..e5e1a3e --- /dev/null +++ b/sys/kern/kern_prot.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)kern_prot.c	8.6 (Berkeley) 1/21/94
+ * $Id: kern_prot.c,v 1.42 1998/11/10 09:16:29 peter Exp $
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/pioctl.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getpid(p, uap)
+	struct proc *p;
+	struct getpid_args *uap;
+{
+
+	p->p_retval[0] = p->p_pid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_pptr->p_pid;
+#endif
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+	int	dummy;
+};
+#endif
+/* ARGSUSED */
+int
+getppid(p, uap)
+	struct proc *p;
+	struct getppid_args *uap;
+{
+
+	p->p_retval[0] = p->p_pptr->p_pid;
+	return (0);
+}
+
+/* Get process group ID; note that POSIX getpgrp takes no parameter */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+	int	dummy;
+};
+#endif
+
+int
+getpgrp(p, uap)
+	struct proc *p;
+	struct getpgrp_args *uap;
+{
+
+	p->p_retval[0] = p->p_pgrp->pg_id;
+	return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+	pid_t	pid;
+};
+#endif
+
+int
+getpgid(p, uap)
+	struct proc *p;
+	struct getpgid_args *uap;
+{
+	struct proc *pt;
+
+	pt = p;
+	if (uap->pid == 0)
+		goto found;
+
+	if ((pt = pfind(uap->pid)) == 0)
+		return ESRCH;
+found:
+	p->p_retval[0] = pt->p_pgrp->pg_id;
+	return 0;
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+	pid_t	pid;
+};
+#endif
+
+int
+getsid(p, uap)
+	struct proc *p;
+	struct getsid_args *uap;
+{
+	struct proc *pt;
+
+	pt = p;
+	if (uap->pid == 0)
+		goto found;
+
+	if ((pt = pfind(uap->pid)) == 0)
+		return ESRCH;
+found:
+	p->p_retval[0] = pt->p_session->s_sid;
+	return 0;
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getuid(p, uap)
+	struct proc *p;
+	struct getuid_args *uap;
+{
+
+	p->p_retval[0] = p->p_cred->p_ruid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_ucred->cr_uid;
+#endif
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+geteuid(p, uap)
+	struct proc *p;
+	struct geteuid_args *uap;
+{
+
+	p->p_retval[0] = p->p_ucred->cr_uid;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getgid(p, uap)
+	struct proc *p;
+	struct getgid_args *uap;
+{
+
+	p->p_retval[0] = p->p_cred->p_rgid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_ucred->cr_groups[0];
+#endif
+	return (0);
+}
+
+/*
+ * Get effective group ID.  The "egid" is groups[0], and could be obtained
+ * via getgroups.  This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +getegid(p, uap) + struct proc *p; + struct getegid_args *uap; +{ + + p->p_retval[0] = p->p_ucred->cr_groups[0]; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +int +getgroups(p, uap) + struct proc *p; + register struct getgroups_args *uap; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if ((ngrp = uap->gidsetsize) == 0) { + p->p_retval[0] = pc->pc_ucred->cr_ngroups; + return (0); + } + if (ngrp < pc->pc_ucred->cr_ngroups) + return (EINVAL); + ngrp = pc->pc_ucred->cr_ngroups; + if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups, + (caddr_t)uap->gidset, ngrp * sizeof(gid_t)))) + return (error); + p->p_retval[0] = ngrp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +setsid(p, uap) + register struct proc *p; + struct setsid_args *uap; +{ + + if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { + return (EPERM); + } else { + (void)enterpgrp(p, p->p_pid, 1); + p->p_retval[0] = p->p_pid; + return (0); + } +} + +/* + * set process group (setpgid/old setpgrp) + * + * caller does setpgid(targpid, targpgid) + * + * pid must be caller or child of caller (ESRCH) + * if a child + * pid must be in same session (EPERM) + * pid can't have done an exec (EACCES) + * if pgid != pid + * there must exist some pid in same session having pgid (EPERM) + * pid must not be session leader (EPERM) + */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp id */ +}; +#endif +/* ARGSUSED */ +int +setpgid(curp, uap) + struct proc *curp; + register struct setpgid_args *uap; +{ + register struct proc *targp; /* target process */ + register struct pgrp *pgrp; /* target pgrp */ + + if (uap->pgid < 0) + return (EINVAL); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == 0 || !inferior(targp)) + return (ESRCH); + if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) + return (EPERM); + if (targp->p_flag & P_EXEC) + return (EACCES); + } else + targp = curp; + if (SESS_LEADER(targp)) + return (EPERM); + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + else if (uap->pgid != targp->p_pid) + if ((pgrp = pgfind(uap->pgid)) == 0 || + pgrp->pg_session != curp->p_session) + return (EPERM); + return (enterpgrp(targp, uap->pgid, 0)); +} + +/* + * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD + * compatable. It says that setting the uid/gid to euid/egid is a special + * case of "appropriate privilege". Once the rules are expanded out, this + * basically means that setuid(nnn) sets all three id's, in all permitted + * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) + * does not set the saved id - this is dangerous for traditional BSD + * programs. For this reason, we *really* do not want to set + * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. + */ +#define POSIX_APPENDIX_B_4_2_2 + +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif +/* ARGSUSED */ +int +setuid(p, uap) + struct proc *p; + struct setuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t uid; + int error; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setuid(geteuid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. 
We need + * to use this clause to be compatable with traditional BSD + * semantics. Basically, it means that "setuid(xx)" sets all + * three id's (assuming you have privs). + * + * Notes on the logic. We do things in three steps. + * 1: We determine if the euid is going to change, and do EPERM + * right away. We unconditionally change the euid later if this + * test is satisfied, simplifying that part of the logic. + * 2: We determine if the real and/or saved uid's are going to + * change. Determined by compile options. + * 3: Change euid last. (after tests in #2 for "appropriate privs") + */ + uid = uap->uid; + if (uid != pc->p_ruid && /* allow setuid(getuid()) */ +#ifdef _POSIX_SAVED_IDS + uid != pc->p_svuid && /* allow setuid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + uid != pc->pc_ucred->cr_uid && /* allow setuid(geteuid()) */ +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or uid == euid) + * If so, we are changing the real uid and/or saved uid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ + uid == pc->pc_ucred->cr_uid || +#endif + suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */ +#endif + { + /* + * Transfer proc count to new user. + */ + if (uid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(uid, 1); + } + /* + * Set real uid + */ + if (uid != pc->p_ruid) { + pc->p_ruid = uid; + setsugid(p); + } + /* + * Set saved uid + * + * XXX always set saved uid even if not _POSIX_SAVED_IDS, as + * the security of seteuid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (pc->p_svuid != uid) { + pc->p_svuid = uid; + setsugid(p); + } + } + + /* + * In all permitted cases, we are changing the euid. + * Copy credentials so other references do not see our changes. + */ + if (pc->pc_ucred->cr_uid != uid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = uid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif +/* ARGSUSED */ +int +seteuid(p, uap) + struct proc *p; + struct seteuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t euid; + int error; + + euid = uap->euid; + if (euid != pc->p_ruid && /* allow seteuid(getuid()) */ + euid != pc->p_svuid && /* allow seteuid(saved uid) */ + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + /* + * Everything's okay, do it. Copy credentials so other references do + * not see our changes. + */ + if (pc->pc_ucred->cr_uid != euid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = euid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif +/* ARGSUSED */ +int +setgid(p, uap) + struct proc *p; + struct setgid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t gid; + int error; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setgid(getegid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatable with traditional BSD + * semantics. Basically, it means that "setgid(xx)" sets all + * three id's (assuming you have privs). + * + * For notes on the logic here, see setuid() above. 
+ */ + gid = uap->gid; + if (gid != pc->p_rgid && /* allow setgid(getgid()) */ +#ifdef _POSIX_SAVED_IDS + gid != pc->p_svgid && /* allow setgid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + gid != pc->pc_ucred->cr_groups[0] && /* allow setgid(getegid()) */ +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or gid == egid) + * If so, we are changing the real uid and saved gid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ + gid == pc->pc_ucred->cr_groups[0] || +#endif + suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */ +#endif + { + /* + * Set real gid + */ + if (pc->p_rgid != gid) { + pc->p_rgid = gid; + setsugid(p); + } + /* + * Set saved gid + * + * XXX always set saved gid even if not _POSIX_SAVED_IDS, as + * the security of setegid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (pc->p_svgid != gid) { + pc->p_svgid = gid; + setsugid(p); + } + } + /* + * In all cases permitted cases, we are changing the egid. + * Copy credentials so other references do not see our changes. + */ + if (pc->pc_ucred->cr_groups[0] != gid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = gid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif +/* ARGSUSED */ +int +setegid(p, uap) + struct proc *p; + struct setegid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t egid; + int error; + + egid = uap->egid; + if (egid != pc->p_rgid && /* allow setegid(getgid()) */ + egid != pc->p_svgid && /* allow setegid(saved gid) */ + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + if (pc->pc_ucred->cr_groups[0] != egid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = egid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* ARGSUSED */ +int +setgroups(p, uap) + struct proc *p; + struct setgroups_args *uap; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if ((error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + ngrp = uap->gidsetsize; + if (ngrp > NGROUPS) + return (EINVAL); + /* + * XXX A little bit lazy here. We could test if anything has + * changed before crcopy() and setting P_SUGID. + */ + pc->pc_ucred = crcopy(pc->pc_ucred); + if (ngrp < 1) { + /* + * setgroups(0, NULL) is a legitimate way of clearing the + * groups vector on non-BSD systems (which generally do not + * have the egid in the groups[0]). We risk security holes + * when running non-BSD software if we do not do the same. 
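+		 * (Editorial note: keeping cr_ngroups at 1 below therefore
+		 * preserves the effective gid in cr_groups[0] while
+		 * dropping the rest of the supplementary list.)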
+ */ + pc->pc_ucred->cr_ngroups = 1; + } else { + if ((error = copyin((caddr_t)uap->gidset, + (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))) + return (error); + pc->pc_ucred->cr_ngroups = ngrp; + } + setsugid(p); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif +/* ARGSUSED */ +int +setreuid(p, uap) + register struct proc *p; + struct setreuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t ruid, euid; + int error; + + ruid = uap->ruid; + euid = uap->euid; + if ((ruid != (uid_t)-1 && ruid != pc->p_ruid && ruid != pc->p_svuid || + euid != (uid_t)-1 && euid != pc->pc_ucred->cr_uid && + euid != pc->p_ruid && euid != pc->p_svuid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + if (euid != (uid_t)-1 && pc->pc_ucred->cr_uid != euid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = euid; + setsugid(p); + } + if (ruid != (uid_t)-1 && pc->p_ruid != ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(ruid, 1); + pc->p_ruid = ruid; + setsugid(p); + } + if ((ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid) && + pc->p_svuid != pc->pc_ucred->cr_uid) { + pc->p_svuid = pc->pc_ucred->cr_uid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif +/* ARGSUSED */ +int +setregid(p, uap) + register struct proc *p; + struct setregid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t rgid, egid; + int error; + + rgid = uap->rgid; + egid = uap->egid; + if ((rgid != (gid_t)-1 && rgid != pc->p_rgid && rgid != pc->p_svgid || + egid != (gid_t)-1 && egid != pc->pc_ucred->cr_groups[0] && + egid != pc->p_rgid && egid != pc->p_svgid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + if (egid != (gid_t)-1 && pc->pc_ucred->cr_groups[0] != egid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = egid; + setsugid(p); + } + if (rgid != (gid_t)-1 && pc->p_rgid != rgid) { + pc->p_rgid = rgid; + setsugid(p); + } + if ((rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid) && + pc->p_svgid != pc->pc_ucred->cr_groups[0]) { + pc->p_svgid = pc->pc_ucred->cr_groups[0]; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct issetugid_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +issetugid(p, uap) + register struct proc *p; + struct issetugid_args *uap; +{ + /* + * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, + * we use P_SUGID because we consider changing the owners as + * "tainting" as well. + * This is significant for procs that start as root and "become" + * a user without an exec - programs cannot know *everything* + * that libc *might* have put in their data segment. + */ + if (p->p_flag & P_SUGID) + return (1); + return (0); +} + +/* + * Check if gid is a member of the group set. + */ +int +groupmember(gid, cred) + gid_t gid; + register struct ucred *cred; +{ + register gid_t *gp; + gid_t *egp; + + egp = &(cred->cr_groups[cred->cr_ngroups]); + for (gp = cred->cr_groups; gp < egp; gp++) + if (*gp == gid) + return (1); + return (0); +} + +/* + * Test whether the specified credentials imply "super-user" + * privilege; if so, and we have accounting info, set the flag + * indicating use of super-powers. + * Returns 0 or error. 
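+ *
+ * Typical call pattern (illustrative only, mirroring the callers above):
+ *
+ *	if ((error = suser(pc->pc_ucred, &p->p_acflag)))
+ *		return (error);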
+ */ +int +suser(cred, acflag) + struct ucred *cred; + u_short *acflag; +{ + if (cred->cr_uid == 0) { + if (acflag) + *acflag |= ASU; + return (0); + } + return (EPERM); +} + +/* + * Allocate a zeroed cred structure. + */ +struct ucred * +crget() +{ + register struct ucred *cr; + + MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK); + bzero((caddr_t)cr, sizeof(*cr)); + cr->cr_ref = 1; + return (cr); +} + +/* + * Free a cred structure. + * Throws away space when ref count gets to 0. + */ +void +crfree(cr) + struct ucred *cr; +{ + if (--cr->cr_ref == 0) + FREE((caddr_t)cr, M_CRED); +} + +/* + * Copy cred structure to a new one and free the old one. + */ +struct ucred * +crcopy(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + if (cr->cr_ref == 1) + return (cr); + newcr = crget(); + *newcr = *cr; + crfree(cr); + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Dup cred struct to a new held one. + */ +struct ucred * +crdup(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + newcr = crget(); + *newcr = *cr; + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Get login name, if available. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif +/* ARGSUSED */ +int +getlogin(p, uap) + struct proc *p; + struct getlogin_args *uap; +{ + + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; + return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, + (caddr_t) uap->namebuf, uap->namelen)); +} + +/* + * Set login name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif +/* ARGSUSED */ +int +setlogin(p, uap) + struct proc *p; + struct setlogin_args *uap; +{ + int error; + char logintmp[MAXLOGNAME]; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (size_t *)0); + if (error == ENAMETOOLONG) + error = EINVAL; + else if (!error) + (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp, + sizeof(logintmp)); + return (error); +} + +void +setsugid(p) + struct proc *p; +{ + p->p_flag |= P_SUGID; + if (!(p->p_pfsflags & PF_ISUGID)) + p->p_stops = 0; +} diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c new file mode 100644 index 0000000..d635668 --- /dev/null +++ b/sys/kern/kern_random.c @@ -0,0 +1,379 @@ +/* + * random_machdep.c -- A strong random number generator + * + * $Id: random_machdep.c,v 1.28 1998/06/18 15:32:07 bde Exp $ + * + * Version 0.95, last modified 18-Oct-95 + * + * Copyright Theodore Ts'o, 1994, 1995. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. 
(This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/select.h> +#include <sys/poll.h> +#include <sys/md5.h> + +#include <machine/random.h> + +#include <i386/isa/icu.h> +#include <i386/isa/intr_machdep.h> + +#define MAX_BLKDEV 4 + +/* + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. + */ +#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */ +#define POOLBITS (POOLWORDS*32) + +#if POOLWORDS == 128 +#define TAP1 99 /* The polynomial taps */ +#define TAP2 59 +#define TAP3 31 +#define TAP4 9 +#define TAP5 7 +#elif POOLWORDS == 64 +#define TAP1 62 /* The polynomial taps */ +#define TAP2 38 +#define TAP3 10 +#define TAP4 6 +#define TAP5 1 +#else +#error No primitive polynomial available for chosen POOLWORDS +#endif + +#define WRITEBUFFER 512 /* size in bytes */ + +/* There is actually only one of these, globally. */ +struct random_bucket { + u_int add_ptr; + u_int entropy_count; + int input_rotate; + u_int32_t *pool; + struct selinfo rsel; +}; + +/* There is one of these per entropy source */ +struct timer_rand_state { + u_long last_time; + int last_delta; + int nbits; +}; + +static struct random_bucket random_state; +static u_int32_t random_pool[POOLWORDS]; +static struct timer_rand_state keyboard_timer_state; +static struct timer_rand_state extract_timer_state; +static struct timer_rand_state irq_timer_state[ICU_LEN]; +#ifdef notyet +static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV]; +#endif +static struct wait_queue *random_wait; + +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +void +rand_initialize(void) +{ + random_state.add_ptr = 0; + random_state.entropy_count = 0; + random_state.pool = random_pool; + random_wait = NULL; + random_state.rsel.si_flags = 0; + random_state.rsel.si_pid = 0; +} + +/* + * This function adds an int into the entropy "pool". It does not + * update the entropy estimate. The caller must do this if appropriate. + * + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. + * + * We rotate the input word by a changing number of bits, to help + * assure that all bits in the entropy get toggled. Otherwise, if we + * consistently feed the entropy pool small numbers (like ticks and + * scancodes, for example), the upper bits of the entropy pool don't + * get affected. 
--- TYT, 10/11/95 + */ +static __inline void +add_entropy_word(struct random_bucket *r, const u_int32_t input) +{ + u_int i; + u_int32_t w; + + w = (input << r->input_rotate) | (input >> (32 - r->input_rotate)); + i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1); + if (i) + r->input_rotate = (r->input_rotate + 7) & 31; + else + /* + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + r->input_rotate = (r->input_rotate + 14) & 31; + + /* XOR in the various taps */ + w ^= r->pool[(i+TAP1)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP2)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP3)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP4)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP5)&(POOLWORDS-1)]; + w ^= r->pool[i]; + /* Rotate w left 1 bit (stolen from SHA) and store */ + r->pool[i] = (w << 1) | (w >> 31); +} + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how any bits of entropy this call has added to the pool. + * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * On the i386, this is assumed to be at most 16 bits, and the high bits + * are used for a high-resolution timer. + */ +static void +add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state, + u_int num) +{ + int delta, delta2; + u_int nbits; + u_int32_t time; + + num ^= timecounter->tc_get_timecount(timecounter) << 16; + r->entropy_count += 2; + + time = ticks; + + add_entropy_word(r, (u_int32_t) num); + add_entropy_word(r, time); + + /* + * Calculate number of bits of randomness we probably + * added. We take into account the first and second order + * deltas in order to make our estimate. + */ + delta = time - state->last_time; + state->last_time = time; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + if (delta < 0) delta = -delta; + if (delta2 < 0) delta2 = -delta2; + delta = MIN(delta, delta2) >> 1; + for (nbits = 0; delta; nbits++) + delta >>= 1; + + r->entropy_count += nbits; + + /* Prevent overflow */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + + if (r->entropy_count >= 8) + selwakeup(&random_state.rsel); +} + +void +add_keyboard_randomness(u_char scancode) +{ + add_timer_randomness(&random_state, &keyboard_timer_state, scancode); +} + +void +add_interrupt_randomness(void *vsc) +{ + int intr; + struct random_softc *sc = vsc; + + (sc->sc_handler)(sc->sc_arg); + intr = sc->sc_intr; + add_timer_randomness(&random_state, &irq_timer_state[intr], intr); +} + +#ifdef notused +void +add_blkdev_randomness(int major) +{ + if (major >= MAX_BLKDEV) + return; + + add_timer_randomness(&random_state, &blkdev_timer_state[major], + 0x200+major); +} +#endif /* notused */ + +#if POOLWORDS % 16 +#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words. +#endif +/* + * This function extracts randomness from the "entropy pool", and + * returns it in a buffer. This function computes how many remaining + * bits of entropy are left in the pool, but it does not restrict the + * number of bytes that are actually obtained. + */ +static __inline int +extract_entropy(struct random_bucket *r, char *buf, int nbytes) +{ + int ret, i; + u_int32_t tmp[4]; + + add_timer_randomness(r, &extract_timer_state, nbytes); + + /* Redundant, but just in case... 
*/ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */ + if (nbytes > 32768) + nbytes = 32768; + + ret = nbytes; + if (r->entropy_count / 8 >= nbytes) + r->entropy_count -= nbytes*8; + else + r->entropy_count = 0; + + while (nbytes) { + /* Hash the pool to get the output */ + tmp[0] = 0x67452301; + tmp[1] = 0xefcdab89; + tmp[2] = 0x98badcfe; + tmp[3] = 0x10325476; + for (i = 0; i < POOLWORDS; i += 16) + MD5Transform(tmp, (char *)(r->pool+i)); + /* Modify pool so next hash will produce different results */ + add_entropy_word(r, tmp[0]); + add_entropy_word(r, tmp[1]); + add_entropy_word(r, tmp[2]); + add_entropy_word(r, tmp[3]); + /* + * Run the MD5 Transform one more time, since we want + * to add at least minimal obscuring of the inputs to + * add_entropy_word(). --- TYT + */ + MD5Transform(tmp, (char *)(r->pool)); + + /* Copy data to destination buffer */ + i = MIN(nbytes, 16); + bcopy(tmp, buf, i); + nbytes -= i; + buf += i; + } + + /* Wipe data from memory */ + bzero(tmp, sizeof(tmp)); + + return ret; +} + +#ifdef notused /* XXX NOT the exported kernel interface */ +/* + * This function is the exported kernel interface. It returns some + * number of good random numbers, suitable for seeding TCP sequence + * numbers, etc. + */ +void +get_random_bytes(void *buf, u_int nbytes) +{ + extract_entropy(&random_state, (char *) buf, nbytes); +} +#endif /* notused */ + +u_int +read_random(void *buf, u_int nbytes) +{ + if ((nbytes * 8) > random_state.entropy_count) + nbytes = random_state.entropy_count / 8; + + return extract_entropy(&random_state, (char *)buf, nbytes); +} + +u_int +read_random_unlimited(void *buf, u_int nbytes) +{ + return extract_entropy(&random_state, (char *)buf, nbytes); +} + +#ifdef notused +u_int +write_random(const char *buf, u_int nbytes) +{ + u_int i; + u_int32_t word, *p; + + for (i = nbytes, p = (u_int32_t *)buf; + i >= sizeof(u_int32_t); + i-= sizeof(u_int32_t), p++) + add_entropy_word(&random_state, *p); + if (i) { + word = 0; + bcopy(p, &word, i); + add_entropy_word(&random_state, word); + } + return nbytes; +} +#endif /* notused */ + +int +random_poll(dev_t dev, int events, struct proc *p) +{ + int s; + int revents = 0; + + s = splhigh(); + if (events & (POLLIN | POLLRDNORM)) + if (random_state.entropy_count >= 8) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &random_state.rsel); + + splx(s); + if (events & (POLLOUT | POLLWRNORM)) + revents |= events & (POLLOUT | POLLWRNORM); /* heh */ + + return (revents); +} + diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c new file mode 100644 index 0000000..1bad1d2 --- /dev/null +++ b/sys/kern/kern_resource.c @@ -0,0 +1,623 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_resource.c,v 1.37 1998/05/28 09:30:18 phk Exp $ + */ + +#include "opt_compat.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int donice __P((struct proc *curp, struct proc *chgp, int n)); +static int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); + +/* + * Resource controls and accounting. 
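+ * This file implements the getpriority()/setpriority() and rtprio()
+ * system calls, the [o]getrlimit()/[o]setrlimit() family, getrusage(),
+ * ruadd() and limcopy(), plus calcru(), which apportions a process's
+ * accumulated run time into user, system and interrupt time.  Other
+ * kernel code typically consults the current limits through p_rlimit,
+ * e.g. p->p_rlimit[RLIMIT_NOFILE].rlim_cur.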
+ */ + +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif +int +getpriority(curp, uap) + struct proc *curp; + register struct getpriority_args *uap; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + low = p->p_nice; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_nice < low) + low = p->p_nice; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == uap->who && + p->p_nice < low) + low = p->p_nice; + break; + + default: + return (EINVAL); + } + if (low == PRIO_MAX + 1) + return (ESRCH); + curp->p_retval[0] = low; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif +/* ARGSUSED */ +int +setpriority(curp, uap) + struct proc *curp; + register struct setpriority_args *uap; +{ + register struct proc *p; + int found = 0, error = 0; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + error = donice(curp, p, uap->prio); + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + error = donice(curp, p, uap->prio); + found++; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == uap->who) { + error = donice(curp, p, uap->prio); + found++; + } + break; + + default: + return (EINVAL); + } + if (found == 0) + return (ESRCH); + return (error); +} + +static int +donice(curp, chgp, n) + register struct proc *curp, *chgp; + register int n; +{ + register struct pcred *pcred = curp->p_cred; + + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid && + pcred->p_ruid != chgp->p_ucred->cr_uid) + return (EPERM); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_nice && suser(pcred->pc_ucred, &curp->p_acflag)) + return (EACCES); + chgp->p_nice = n; + (void)resetpriority(chgp); + return (0); +} + +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* ARGSUSED */ +int +rtprio(curp, uap) + struct proc *curp; + register struct rtprio_args *uap; +{ + register struct proc *p; + register struct pcred *pcred = curp->p_cred; + struct rtprio rtp; + int error; + + error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + if (error) + return (error); + + if (uap->pid == 0) + p = curp; + else + p = pfind(uap->pid); + + if (p == 0) + return (ESRCH); + + switch (uap->function) { + case RTP_LOOKUP: + return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid && + pcred->p_ruid != p->p_ucred->cr_uid) + return (EPERM); + /* disallow setting rtprio in most 
cases if not superuser */ + if (suser(pcred->pc_ucred, &curp->p_acflag)) { + /* can't set someone else's */ + if (uap->pid) + return (EPERM); + /* can't set realtime priority */ +/* + * Realtime priority has to be restricted for reasons which should be + * obvious. However, for idle priority, there is a potential for + * system deadlock if an idleprio process gains a lock on a resource + * that other processes need (and the idleprio process can't run + * due to a CPU-bound normal process). Fix me! XXX + */ +#if 0 + if (RTP_PRIO_IS_REALTIME(rtp.type)) +#endif + if (rtp.type != RTP_PRIO_NORMAL) + return (EPERM); + } + switch (rtp.type) { +#ifdef RTP_PRIO_FIFO + case RTP_PRIO_FIFO: +#endif + case RTP_PRIO_REALTIME: + case RTP_PRIO_NORMAL: + case RTP_PRIO_IDLE: + if (rtp.prio > RTP_PRIO_MAX) + return (EINVAL); + p->p_rtprio = rtp; + return (0); + default: + return (EINVAL); + } + + default: + return (EINVAL); + } +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +osetrlimit(p, uap) + struct proc *p; + register struct osetrlimit_args *uap; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + return (dosetrlimit(p, uap->which, &lim)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +ogetrlimit(p, uap) + struct proc *p; + register struct ogetrlimit_args *uap; +{ + struct orlimit olim; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +setrlimit(p, uap) + struct proc *p; + register struct __setrlimit_args *uap; +{ + struct rlimit alim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) + return (error); + return (dosetrlimit(p, uap->which, &alim)); +} + +static int +dosetrlimit(p, which, limp) + struct proc *p; + u_int which; + struct rlimit *limp; +{ + register struct rlimit *alimp; + int error; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + + /* + * Preserve historical bugs by treating negative limits as unsigned. 
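+	 * For instance, a (signed) rlim_cur of -1 from an old binary is
+	 * mapped to RLIM_INFINITY rather than rejected.  Below, raising
+	 * either value above the current hard limit requires superuser
+	 * privilege, and the soft limit is silently clamped to the hard
+	 * limit.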
+ */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_CPU: + if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) + p->p_limit->p_cpulimit = RLIM_INFINITY; + else + p->p_limit->p_cpulimit = + (rlim_t)1000000 * limp->rlim_cur; + break; + case RLIMIT_DATA: + if (limp->rlim_cur > MAXDSIZ) + limp->rlim_cur = MAXDSIZ; + if (limp->rlim_max > MAXDSIZ) + limp->rlim_max = MAXDSIZ; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > MAXSSIZ) + limp->rlim_cur = MAXSSIZ; + if (limp->rlim_max > MAXSSIZ) + limp->rlim_max = MAXSSIZ; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. If stack limit is going + * up make more accessible, if going down make inaccessible. + */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; + break; + } + *alimp = *limp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +getrlimit(p, uap) + struct proc *p; + register struct __getrlimit_args *uap; +{ + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit))); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. + */ +void +calcru(p, up, sp, ip) + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; +{ + int64_t totusec; + u_int64_t u, st, ut, it, tot; + int s; + struct timeval tv; + + /* XXX: why spl-protect ? worst case is an off-by-one report */ + s = splstatclock(); + st = p->p_sticks; + ut = p->p_uticks; + it = p->p_iticks; + splx(s); + + tot = st + ut + it; + if (tot == 0) { + st = 1; + tot = 1; + } + + totusec = p->p_runtime; +#ifdef SMP + if (p->p_oncpu != (char)0xff) { +#else + if (p == curproc) { +#endif + /* + * Adjust for the current time slice. This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. 
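+		 * Concretely, totusec started from p->p_runtime, which is
+		 * only accumulated at context switch time, so it is advanced
+		 * here by (now - p_switchtime) to cover the slice we are
+		 * currently running.  The total is then split in proportion
+		 * to the tick counts; e.g. with ut=6, st=2, it=0 and
+		 * 1000000 usec of run time, up works out to 750 ms and sp
+		 * to 250 ms.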
+ */ + microuptime(&tv); + totusec += (tv.tv_usec - p->p_switchtime.tv_usec) + + (tv.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000; + + /* + * Copy the time that was just read to `switchtime' in case + * we are being called from exit1(). Exits don't go through + * mi_switch(), so `switchtime' doesn't get set in the normal + * way. We set it here instead of more cleanly in exit1() + * to avoid losing track of the time between the calls to + * microuptime(). + */ + switchtime = tv; + } + if (totusec < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time of %ld usec for pid %d (%s)\n", + (long)totusec, p->p_pid, p->p_comm); + totusec = 0; + } + u = totusec; + st = (u * st) / tot; + sp->tv_sec = st / 1000000; + sp->tv_usec = st % 1000000; + ut = (u * ut) / tot; + up->tv_sec = ut / 1000000; + up->tv_usec = ut % 1000000; + if (ip != NULL) { + it = (u * it) / tot; + ip->tv_sec = it / 1000000; + ip->tv_usec = it % 1000000; + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif +/* ARGSUSED */ +int +getrusage(p, uap) + register struct proc *p; + register struct getrusage_args *uap; +{ + register struct rusage *rup; + + switch (uap->who) { + + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + return (EINVAL); + } + return (copyout((caddr_t)rup, (caddr_t)uap->rusage, + sizeof (struct rusage))); +} + +void +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss < ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. + */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit)); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..4d6db41 --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,530 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $Id: kern_shutdown.c,v 1.43 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_ddb.h" +#include "opt_hw_wdog.h" +#include "opt_panic.h" +#include "opt_show_busybufs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/reboot.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/sysproto.h> + +#include <machine/pcb.h> +#include <machine/clock.h> +#include <machine/cons.h> +#include <machine/md_var.h> +#ifdef SMP +#include <machine/smp.h> /* smp_active, cpuid */ +#endif + +#include <sys/signalvar.h> + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#ifdef DDB +#ifdef DDB_UNATTENDED +int debugger_on_panic = 0; +#else +int debugger_on_panic = 1; +#endif +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, ""); +#endif + +#ifdef HW_WDOG +/* + * If there is a hardware watchdog, point this at the function needed to + * hold it off. + * It's needed when the kernel needs to do some lengthy operations. + * e.g. in wd.c when dumping core.. It's most annoying to have + * your precious core-dump only half written because the wdog kicked in. + */ +watchdog_tickle_fn wdog_tickler = NULL; +#endif /* HW_WDOG */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * callout list for things to do a shutdown + */ +typedef struct shutdown_list_element { + LIST_ENTRY(shutdown_list_element) links; + bootlist_fn function; + void *arg; + int priority; +} *sle_p; + +/* + * There are three shutdown lists. Some things need to be shut down + * earlier than others. 
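+ * They are run in order: SHUTDOWN_PRE_SYNC callouts before the
+ * filesystems are synced, SHUTDOWN_POST_SYNC callouts once filesystem
+ * activity has ceased, and SHUTDOWN_FINAL callouts just before the
+ * machine halts or resets.
+ *
+ * A minimal sketch of a driver registering a callout through the
+ * at_shutdown() interface below, assuming the bootlist_fn callback
+ * receives the howto flags and the registered argument; the names
+ * mydrv_shutdown, mydrv_softc, mydrv_stop_dma and sc are hypothetical
+ * and only illustrate the usage:
+ *
+ *	static void
+ *	mydrv_shutdown(int howto, void *arg)
+ *	{
+ *		struct mydrv_softc *sc = arg;
+ *
+ *		mydrv_stop_dma(sc);
+ *	}
+ *
+ *	at_shutdown(mydrv_shutdown, sc, SHUTDOWN_POST_SYNC);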
+ */ +LIST_HEAD(shutdown_list, shutdown_list_element); + +static struct shutdown_list shutdown_lists[SHUTDOWN_FINAL + 1]; + +static void boot __P((int)) __dead2; +static void dumpsys __P((void)); + +#ifndef _SYS_SYSPROTO_H_ +struct reboot_args { + int opt; +}; +#endif +/* ARGSUSED */ + +/* + * The system call that results in a reboot + */ +int +reboot(p, uap) + struct proc *p; + struct reboot_args *uap; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + boot(uap->opt); + return (0); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +void +shutdown_nice() +{ + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + psignal(initproc, SIGINT); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; +static struct pcb dumppcb; + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. + */ +static void +boot(howto) + int howto; +{ + sle_p ep; + +#ifdef SMP + if (smp_active) { + printf("boot() called on cpu#%d\n", cpuid); + } +#endif + /* + * Do any callouts that should be done BEFORE syncing the filesystems. + */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_PRE_SYNC], links) + (*ep->function)(howto, ep->arg); + + /* + * Now sync filesystems + */ + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&proc0, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) + == B_BUSY) { + nbusy++; + } else if ((bp->b_flags & (B_DELWRI | B_INVAL)) + == B_DELWRI) { + /* bawrite(bp);*/ + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + sync(&proc0, NULL); + DELAY(50000 * iter); + } + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) + ||((bp->b_flags & (B_DELWRI | B_INVAL))== B_DELWRI)) + if(bp->b_dev == NODEV) + CIRCLEQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); + else + nbusy++; + + + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("giving up\n"); +#ifdef SHOW_BUSYBUFS + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) + == B_BUSY) { + nbusy++; + printf( + "%d: dev:%08lx, flags:%08lx, blkno:%ld, lblkno:%ld\n", + nbusy, (u_long)bp->b_dev, + bp->b_flags, (long)bp->b_blkno, + (long)bp->b_lblkno); + } + } + DELAY(5000000); /* 5 seconds */ +#endif + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + + /* + * Ok, now do things that assume all filesystem activity has + * been completed. 
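+	 * The remaining order is: run the SHUTDOWN_POST_SYNC callouts,
+	 * take a crash dump if RB_DUMP (but not RB_HALT) was requested and
+	 * we are not cold, run the SHUTDOWN_FINAL callouts, and then either
+	 * halt at the console or cpu_reset() according to `howto'.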
+ */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_POST_SYNC], links) + (*ep->function)(howto, ep->arg); + splhigh(); + if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold) { + savectx(&dumppcb); +#ifdef __i386__ + dumppcb.pcb_cr3 = rcr3(); +#endif + dumpsys(); + } + + /* Now that we're going to really halt the system... */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_FINAL], links) + (*ep->function)(howto, ep->arg); + + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + howto &= ~RB_HALT; + break; + } + } else if (howto & RB_DUMP) { + /* System Paniced */ + + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + int loop; + printf("Automatic reboot in %d seconds - " + "press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; + loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? */ + if (cncheckc() != -1) + break; + } + if (!loop) + goto die; + } + } else { /* zero time specified - reboot NOW */ + goto die; + } + printf("--> Press a key on the console to reboot <--\n"); + cngetc(); + } +die: + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + for(;;) ; + /* NOTREACHED */ +} + +/* + * Magic number for savecore + * + * exported (symorder) and used at least by savecore(8) + * + */ +static u_long const dumpmag = 0x8fca0101UL; + +static int dumpsize = 0; /* also for savecore */ + +static int dodump = 1; +SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); + +/* ARGSUSED */ +static void dump_conf __P((void *dummy)); +static void +dump_conf(dummy) + void *dummy; +{ + cpu_dumpconf(); +} +SYSINIT(dump_conf, SI_SUB_DUMP_CONF, SI_ORDER_FIRST, dump_conf, NULL) + +/* + * Doadump comes here after turning off memory management and + * getting on the dump stack, either when called above, or by + * the auto-restart code. + */ +static void +dumpsys(void) +{ + + if (!dodump) + return; + if (dumpdev == NODEV) + return; + if (!(bdevsw[major(dumpdev)])) + return; + if (!(bdevsw[major(dumpdev)]->d_dump)) + return; + dumpsize = Maxmem; + printf("\ndumping to dev %lx, offset %ld\n", (u_long)dumpdev, dumplo); + printf("dump "); + switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { + + case ENXIO: + printf("device bad\n"); + break; + + case EFAULT: + printf("device not ready\n"); + break; + + case EINVAL: + printf("area improper\n"); + break; + + case EIO: + printf("i/o error\n"); + break; + + case EINTR: + printf("aborted from console\n"); + break; + + default: + printf("succeeded\n"); + break; + } +} + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + */ +void +panic(const char *fmt, ...) 
+{ + int bootopt; + va_list ap; + static char buf[256]; + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + va_start(ap, fmt); + (void)vsnprintf(buf, sizeof(buf), fmt, ap); + if (panicstr == fmt) + panicstr = buf; + va_end(ap); + printf("panic: %s\n", buf); +#ifdef SMP + /* three seperate prints in case of an unmapped page and trap */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#endif + boot(bootopt); +} + +/* + * Three routines to handle adding/deleting items on the + * shutdown callout lists + * + * at_shutdown(): + * Take the arguments given and put them onto the shutdown callout list. + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_shutdown(bootlist_fn function, void *arg, int queue) +{ + return(at_shutdown_pri(function, arg, queue, SHUTDOWN_PRI_DEFAULT)); +} + +/* + * at_shutdown_pri(): + * Take the arguments given and put them onto the shutdown callout list + * with the given execution priority. + * returns 0 on success. + */ +int +at_shutdown_pri(bootlist_fn function, void *arg, int queue, int pri) +{ + sle_p ep, ip; + + if (queue < SHUTDOWN_PRE_SYNC + || queue > SHUTDOWN_FINAL) { + printf("at_shutdown: bad exit callout queue %d specified\n", + queue); + return (EINVAL); + } + if (rm_at_shutdown(function, arg)) + printf("at_shutdown: exit callout entry was already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + ep->arg = arg; + ep->priority = pri; + + /* Sort into list of items on this queue */ + ip = LIST_FIRST(&shutdown_lists[queue]); + if (ip == NULL) { + LIST_INSERT_HEAD(&shutdown_lists[queue], ep, links); + } else { + for (; LIST_NEXT(ip, links) != NULL; ip = LIST_NEXT(ip, links)) { + if (ep->priority < ip->priority) { + LIST_INSERT_BEFORE(ip, ep, links); + ep = NULL; + break; + } + } + if (ep != NULL) + LIST_INSERT_AFTER(ip, ep, links); + } + return (0); +} + +/* + * Scan the exit callout lists for the given items and remove them. + * Returns the number of items removed. + */ +int +rm_at_shutdown(bootlist_fn function, void *arg) +{ + sle_p ep; + int count; + int queue; + + count = 0; + for (queue = SHUTDOWN_PRE_SYNC; queue < SHUTDOWN_FINAL; queue++) { + LIST_FOREACH(ep, &shutdown_lists[queue], links) { + if ((ep->function == function) && (ep->arg == arg)) { + LIST_REMOVE(ep, links); + free(ep, M_TEMP); + count++; + } + } + } + return (count); +} diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 0000000..bf89d8a --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,1455 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $Id: kern_sig.c,v 1.52 1999/01/08 17:31:10 eivind Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#define SIGPROP /* include signal properties table */ +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/fcntl.h> +#include <sys/wait.h> +#include <sys/ktrace.h> +#include <sys/syslog.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> + +#include <machine/cpu.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +static int killpg1 __P((struct proc *cp, int signum, int pgid, int all)); +static void setsigvec __P((struct proc *p, int signum, struct sigaction *sa)); +static void stop __P((struct proc *)); + +static int kern_logsigexit = 1; +SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, ""); + +/* + * Can process p, with pcred pc, send the signal signum to process q? + */ +#define CANSIGNAL(p, pc, q, signum) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ + ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) + +/* + * Policy -- Can real uid ruid with ucred uc send a signal to process q? 
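+ * Delivery is allowed when the sender's effective uid is 0 (root), or
+ * when either of the sender's uids (the real uid `ruid' or the effective
+ * uid uc->cr_uid) matches either of the target's uids (its real p_ruid
+ * or its effective cr_uid).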
+ */ +#define CANSIGIO(ruid, uc, q) \ + ((uc)->cr_uid == 0 || \ + (ruid) == (q)->p_cred->p_ruid || \ + (uc)->cr_uid == (q)->p_cred->p_ruid || \ + (ruid) == (q)->p_ucred->cr_uid || \ + (uc)->cr_uid == (q)->p_ucred->cr_uid) + +int sugid_coredump; +SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, ""); + +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int signum; + struct sigaction *nsa; + struct sigaction *osa; +}; +#endif +/* ARGSUSED */ +int +sigaction(p, uap) + struct proc *p; + register struct sigaction_args *uap; +{ + struct sigaction vec; + register struct sigaction *sa; + register struct sigacts *ps = p->p_sigacts; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) + return (EINVAL); + sa = &vec; + if (uap->osa) { + sa->sa_handler = ps->ps_sigact[signum]; + sa->sa_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sa->sa_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sa->sa_flags |= SA_ONSTACK; + if ((ps->ps_sigintr & bit) == 0) + sa->sa_flags |= SA_RESTART; + if ((ps->ps_sigreset & bit) != 0) + sa->sa_flags |= SA_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sa->sa_flags |= SA_NODEFER; +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP) +#endif /* COMPAT_LINUX_THREADS */ + sa->sa_flags |= SA_NOCLDSTOP; +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDWAIT) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDWAIT) +#endif /* COMPAT_LINUX_THREADS */ + sa->sa_flags |= SA_NOCLDWAIT; + if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa, + sizeof (vec)))) + return (error); + } + if (uap->nsa) { + if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa, + sizeof (vec)))) + return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sa->sa_handler != SIG_DFL) + return (EINVAL); + setsigvec(p, signum, sa); + } + return (0); +} + +static void +setsigvec(p, signum, sa) + register struct proc *p; + int signum; + register struct sigaction *sa; +{ + register struct sigacts *ps = p->p_sigacts; + register int bit; + + bit = sigmask(signum); + /* + * Change setting atomically. + */ + (void) splhigh(); + ps->ps_sigact[signum] = sa->sa_handler; + ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; + if ((sa->sa_flags & SA_RESTART) == 0) + ps->ps_sigintr |= bit; + else + ps->ps_sigintr &= ~bit; + if (sa->sa_flags & SA_ONSTACK) + ps->ps_sigonstack |= bit; + else + ps->ps_sigonstack &= ~bit; + if (sa->sa_flags & SA_RESETHAND) + ps->ps_sigreset |= bit; + else + ps->ps_sigreset &= ~bit; + if (sa->sa_flags & SA_NODEFER) + ps->ps_signodefer |= bit; + else + ps->ps_signodefer &= ~bit; +#ifdef COMPAT_SUNOS + if (sa->sa_flags & SA_USERTRAMP) + ps->ps_usertramp |= bit; + else + ps->ps_usertramp &= ~bit; +#endif + if (signum == SIGCHLD) { + if (sa->sa_flags & SA_NOCLDSTOP) +#ifndef COMPAT_LINUX_THREADS + p->p_flag |= P_NOCLDSTOP; + else + p->p_flag &= ~P_NOCLDSTOP; +#else + p->p_procsig->ps_flag |= P_NOCLDSTOP; + else + p->p_procsig->ps_flag &= ~P_NOCLDSTOP; +#endif /* COMPAT_LINUX_THREADS */ + if (sa->sa_flags & SA_NOCLDWAIT) { + /* + * Paranoia: since SA_NOCLDWAIT is implemented by + * reparenting the dying child to PID 1 (and + * trust it to reap the zombie), PID 1 itself is + * forbidden to set SA_NOCLDWAIT. 
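+			 * For init the request is silently dropped (the
+			 * flag is cleared) instead of being rejected.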
+ */ + if (p->p_pid == 1) +#ifndef COMPAT_LINUX_THREADS + p->p_flag &= ~P_NOCLDWAIT; + else + p->p_flag |= P_NOCLDWAIT; +#else + p->p_procsig->ps_flag &= ~P_NOCLDWAIT; + else + p->p_procsig->ps_flag |= P_NOCLDWAIT; +#endif /* COMPAT_LINUX_THREADS */ + } else +#ifndef COMPAT_LINUX_THREADS + p->p_flag &= ~P_NOCLDWAIT; +#else + p->p_procsig->ps_flag &= ~P_NOCLDWAIT; +#endif /* COMPAT_LINUX_THREADS */ + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to ignore. + * However, don't put SIGCONT in p_sigignore, + * as we have to restart the process. + */ + if (sa->sa_handler == SIG_IGN || + (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) { + p->p_siglist &= ~bit; /* never to be seen again */ + if (signum != SIGCONT) + p->p_sigignore |= bit; /* easier in psignal */ + p->p_sigcatch &= ~bit; + } else { + p->p_sigignore &= ~bit; + if (sa->sa_handler == SIG_DFL) + p->p_sigcatch &= ~bit; + else + p->p_sigcatch |= bit; + } + (void) spl0(); +} + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + for (i = 0; i < NSIG; i++) + if (sigprop[i] & SA_IGNORE && i != SIGCONT) + p->p_sigignore |= sigmask(i); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps = p->p_sigacts; + register int nc, mask; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). + */ + while (p->p_sigcatch) { + nc = ffs((long)p->p_sigcatch); + mask = sigmask(nc); + p->p_sigcatch &= ~mask; + if (sigprop[nc] & SA_IGNORE) { + if (nc != SIGCONT) + p->p_sigignore |= mask; + p->p_siglist &= ~mask; + } + ps->ps_sigact[nc] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + ps->ps_sigstk.ss_flags = SS_DISABLE; + ps->ps_sigstk.ss_size = 0; + ps->ps_sigstk.ss_sp = 0; + ps->ps_flags = 0; +} + +/* + * Manipulate signal mask. + * Note that we receive new mask, not pointer, + * and return old mask as return value; + * the library stub does the rest. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + sigset_t mask; +}; +#endif +int +sigprocmask(p, uap) + register struct proc *p; + struct sigprocmask_args *uap; +{ + int error = 0; + + p->p_retval[0] = p->p_sigmask; + (void) splhigh(); + + switch (uap->how) { + case SIG_BLOCK: + p->p_sigmask |= uap->mask &~ sigcantmask; + break; + + case SIG_UNBLOCK: + p->p_sigmask &= ~uap->mask; + break; + + case SIG_SETMASK: + p->p_sigmask = uap->mask &~ sigcantmask; + break; + + default: + error = EINVAL; + break; + } + (void) spl0(); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sigpending(p, uap) + struct proc *p; + struct sigpending_args *uap; +{ + + p->p_retval[0] = p->p_siglist; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif +/* ARGSUSED */ +int +osigvec(p, uap) + struct proc *p; + register struct osigvec_args *uap; +{ + struct sigvec vec; + register struct sigacts *ps = p->p_sigacts; + register struct sigvec *sv; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) + return (EINVAL); + sv = &vec; + if (uap->osv) { + *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; + sv->sv_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sv->sv_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sv->sv_flags |= SV_ONSTACK; + if ((ps->ps_sigintr & bit) != 0) + sv->sv_flags |= SV_INTERRUPT; + if ((ps->ps_sigreset & bit) != 0) + sv->sv_flags |= SV_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sv->sv_flags |= SV_NODEFER; +#ifndef COMPAT_SUNOS +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP) +#endif /* COMPAT_LINUX_THREADS */ + sv->sv_flags |= SV_NOCLDSTOP; +#endif + if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv, + sizeof (vec)))) + return (error); + } + if (uap->nsv) { + if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv, + sizeof (vec)))) + return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sv->sv_handler != SIG_DFL) + return (EINVAL); +#ifdef COMPAT_SUNOS + sv->sv_flags |= SA_USERTRAMP; +#endif + sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ + setsigvec(p, signum, (struct sigaction *)sv); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif +int +osigblock(p, uap) + register struct proc *p; + struct osigblock_args *uap; +{ + + (void) splhigh(); + p->p_retval[0] = p->p_sigmask; + p->p_sigmask |= uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif +int +osigsetmask(p, uap) + struct proc *p; + struct osigsetmask_args *uap; +{ + + (void) splhigh(); + p->p_retval[0] = p->p_sigmask; + p->p_sigmask = uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + sigset_t mask; +}; +#endif +/* ARGSUSED */ +int +sigsuspend(p, uap) + register struct proc *p; + struct sigsuspend_args *uap; +{ + register struct sigacts *ps = p->p_sigacts; + + /* + * When returning from sigpause, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ +#ifndef COMPAT_LINUX_THREADS + ps->ps_oldmask = p->p_sigmask; + ps->ps_flags |= SAS_OLDMASK; +#else + p->p_oldsigmask = p->p_sigmask; +#endif /* COMPAT_LINUX_THREADS */ + p->p_sigmask = uap->mask &~ sigcantmask; + while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + /* always return EINTR rather than ERESTART... 
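(restarting the call would simply put the process back to sleep with the temporary mask installed, and the caller would never see that its handler had run) 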
*/ + return (EINTR); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif +/* ARGSUSED */ +int +osigstack(p, uap) + struct proc *p; + register struct osigstack_args *uap; +{ + struct sigstack ss; + struct sigacts *psp; + int error = 0; + + psp = p->p_sigacts; + ss.ss_sp = psp->ps_sigstk.ss_sp; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; + if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss, + sizeof (struct sigstack)))) + return (error); + if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, + sizeof (ss))) == 0) { + psp->ps_sigstk.ss_sp = ss.ss_sp; + psp->ps_sigstk.ss_size = 0; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; + psp->ps_flags |= SAS_ALTSTACK; + } + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + struct sigaltstack *nss; + struct sigaltstack *oss; +}; +#endif +/* ARGSUSED */ +int +sigaltstack(p, uap) + struct proc *p; + register struct sigaltstack_args *uap; +{ + struct sigacts *psp; + struct sigaltstack ss; + int error; + + psp = p->p_sigacts; + if ((psp->ps_flags & SAS_ALTSTACK) == 0) + psp->ps_sigstk.ss_flags |= SS_DISABLE; + if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)uap->oss, sizeof (struct sigaltstack)))) + return (error); + if (uap->nss == 0) + return (0); + if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss)))) + return (error); + if (ss.ss_flags & SS_DISABLE) { + if (psp->ps_sigstk.ss_flags & SS_ONSTACK) + return (EINVAL); + psp->ps_flags &= ~SAS_ALTSTACK; + psp->ps_sigstk.ss_flags = ss.ss_flags; + return (0); + } + if (ss.ss_size < MINSIGSTKSZ) + return (ENOMEM); + psp->ps_flags |= SAS_ALTSTACK; + psp->ps_sigstk= ss; + return (0); +} + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. + */ +int +killpg1(cp, signum, pgid, all) + register struct proc *cp; + int signum, pgid, all; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + struct pgrp *pgrp; + int nfound = 0; + + if (all) + /* + * broadcast + */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == cp || !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + else { + if (pgid == 0) + /* + * zero pgid means send to my process group. + */ + pgrp = cp->p_pgrp; + else { + pgrp = pgfind(pgid); + if (pgrp == NULL) + return (ESRCH); + } + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p->p_stat == SZOMB || + !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + } + return (nfound ? 
0 : ESRCH); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* ARGSUSED */ +int +kill(cp, uap) + register struct proc *cp; + register struct kill_args *uap; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, uap->signum)) + return (EPERM); + if (uap->signum) + psignal(p, uap->signum); + return (0); + } + switch (uap->pid) { + case -1: /* broadcast signal */ + return (killpg1(cp, uap->signum, 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, uap->signum, 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, uap->signum, -uap->pid, 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* ARGSUSED */ +int +okillpg(p, uap) + struct proc *p; + register struct okillpg_args *uap; +{ + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + return (killpg1(p, uap->signum, uap->pgid, 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Send a signal to a process group. + */ +void +gsignal(pgid, signum) + int pgid, signum; +{ + struct pgrp *pgrp; + + if (pgid && (pgrp = pgfind(pgid))) + pgsignal(pgrp, signum, 0); +} + +/* + * Send a signal to a process group. If checktty is 1, + * limit to members which have a controlling terminal. + */ +void +pgsignal(pgrp, signum, checkctty) + struct pgrp *pgrp; + int signum, checkctty; +{ + register struct proc *p; + + if (pgrp) + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) + if (checkctty == 0 || p->p_flag & P_CONTROLT) + psignal(p, signum); +} + +/* + * Send a signal caused by a trap to the current process. + * If it will be caught immediately, deliver it with correct code. + * Otherwise, post it normally. + */ +void +trapsignal(p, signum, code) + struct proc *p; + register int signum; + u_long code; +{ + register struct sigacts *ps = p->p_sigacts; + int mask; + + mask = sigmask(signum); + if ((p->p_flag & P_TRACED) == 0 && (p->p_sigcatch & mask) != 0 && + (p->p_sigmask & mask) == 0) { + p->p_stats->p_ru.ru_nsignals++; +#ifdef KTRACE + if (KTRPOINT(p, KTR_PSIG)) + ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], + p->p_sigmask, code); +#endif + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum, + p->p_sigmask, code); + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } + } else { +#ifndef COMPAT_LINUX_THREADS + ps->ps_code = code; /* XXX for core dump/debugger */ + ps->ps_sig = signum; /* XXX to verify code */ +#else + p->p_code = code; /* XXX for core dump/debugger */ + p->p_sig = signum; /* XXX to verify code */ +#endif /* COMPAT_LINUX_THREADS */ + psignal(p, signum); + } +} + +/* + * Send the signal to the process. If the signal has an action, the action + * is usually performed by the target process rather than the caller; we add + * the signal to the set of pending signals for the process. + * + * Exceptions: + * o When a stop signal is sent to a sleeping process that takes the + * default action, the process is stopped without awakening it. 
+ * o SIGCONT restarts stopped processes (or puts them back to sleep) + * regardless of the signal action (eg, blocked or ignored). + * + * Other ignored signals are discarded immediately. + */ +void +psignal(p, signum) + register struct proc *p; + register int signum; +{ + register int s, prop; + register sig_t action; + int mask; + + if ((u_int)signum >= NSIG || signum == 0) { + printf("psignal: signum %d\n", signum); + panic("psignal signal number"); + } + mask = sigmask(signum); + prop = sigprop[signum]; + + /* + * If proc is traced, always give parent a chance; + * if signal event is tracked by procfs, give *that* + * a chance, as well. + */ + if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) + action = SIG_DFL; + else { + /* + * If the signal is being ignored, + * then we forget about it immediately. + * (Note: we don't set SIGCONT in p_sigignore, + * and if it is set to SIG_IGN, + * action will be SIG_DFL here.) + */ +#ifndef COMPAT_LINUX_THREADS + if (p->p_sigignore & mask) +#else + if ((p->p_sigignore & mask) || (p->p_flag & P_WEXIT)) +#endif /* COMPAT_LINUX_THREADS */ + return; + if (p->p_sigmask & mask) + action = SIG_HOLD; + else if (p->p_sigcatch & mask) + action = SIG_CATCH; + else + action = SIG_DFL; + } + + if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) && + (p->p_flag & P_TRACED) == 0) + p->p_nice = NZERO; + + if (prop & SA_CONT) + p->p_siglist &= ~stopsigmask; + + if (prop & SA_STOP) { + /* + * If sending a tty stop signal to a member of an orphaned + * process group, discard the signal here if the action + * is default; don't stop the process below if sleeping, + * and don't clear any pending SIGCONT. + */ + if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && + action == SIG_DFL) + return; + p->p_siglist &= ~contsigmask; + } + p->p_siglist |= mask; + + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) + return; + s = splhigh(); + switch (p->p_stat) { + + case SSLEEP: + /* + * If process is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((p->p_flag & P_SINTR) == 0) + goto out; + /* + * Process is sleeping and traced... make it runnable + * so it can discover the signal in issignal() and stop + * for the parent. + */ + if (p->p_flag & P_TRACED) + goto run; + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + p->p_siglist &= ~mask; + goto out; + } + /* + * When a sleeping process receives a stop + * signal, process immediately if possible. + * All other (caught or default) signals + * cause the process to run. + */ + if (prop & SA_STOP) { + if (action != SIG_DFL) + goto runfast; + /* + * If a child holding parent blocked, + * stopping could cause deadlock. + */ + if (p->p_flag & P_PPWAIT) + goto out; + p->p_siglist &= ~mask; + p->p_xstat = signum; +#ifndef COMPAT_LINUX_THREADS + if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0) +#else + if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0) +#endif /* COMPAT_LINUX_THREADS */ + psignal(p->p_pptr, SIGCHLD); + stop(p); + goto out; + } else + goto runfast; + /*NOTREACHED*/ + + case SSTOP: + /* + * If traced process is already stopped, + * then no further action is necessary. 
+ */ + if (p->p_flag & P_TRACED) + goto out; + + /* + * Kill signal always sets processes running. + */ + if (signum == SIGKILL) + goto runfast; + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist, as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, then it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + if (action == SIG_DFL) + p->p_siglist &= ~mask; + if (action == SIG_CATCH) + goto runfast; + if (p->p_wchan == 0) + goto run; + p->p_stat = SSLEEP; + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again. + * (If we did the shell could get confused.) + */ + p->p_siglist &= ~mask; /* take it away */ + goto out; + } + + /* + * If process is sleeping interruptibly, then simulate a + * wakeup so that when it is continued, it will be made + * runnable and can look at the signal. But don't make + * the process runnable, leave it stopped. + */ + if (p->p_wchan && p->p_flag & P_SINTR) + unsleep(p); + goto out; + + default: + /* + * SRUN, SIDL, SZOMB do nothing with the signal, + * other than kicking ourselves if we are running. + * It will either never be noticed, or noticed very soon. + */ + if (p == curproc) + signotify(p); +#ifdef SMP + else if (p->p_stat == SRUN) + forward_signal(p); +#endif + goto out; + } + /*NOTREACHED*/ + +runfast: + /* + * Raise priority to at least PUSER. + */ + if (p->p_priority > PUSER) + p->p_priority = PUSER; +run: + setrunnable(p); +out: + splx(s); +} + +/* + * If the current process has received a signal (should be caught or cause + * termination, should interrupt current syscall), return the signal number. + * Stop signals with default action are processed immediately, then cleared; + * they aren't returned. This is checked after each entry to the system for + * a syscall or trap (though this can usually be done without calling issignal + * by checking the pending signal masks in the CURSIG macro.) The normal call + * sequence is + * + * while (signum = CURSIG(curproc)) + * postsig(signum); + */ +int +issignal(p) + register struct proc *p; +{ + register int signum, mask, prop; + + for (;;) { + int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); + + mask = p->p_siglist & ~p->p_sigmask; + if (p->p_flag & P_PPWAIT) + mask &= ~stopsigmask; + if (mask == 0) /* no signal to send */ + return (0); + signum = ffs((long)mask); + mask = sigmask(signum); + prop = sigprop[signum]; + + STOPEVENT(p, S_SIG, signum); + + /* + * We should see pending but ignored signals + * only if P_TRACED was on when they were posted. + */ + if ((mask & p->p_sigignore) && (traced == 0)) { + p->p_siglist &= ~mask; + continue; + } + if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { + /* + * If traced, always stop, and stay + * stopped until released by the parent. + */ + p->p_xstat = signum; + psignal(p->p_pptr, SIGCHLD); + do { + stop(p); + mi_switch(); + } while (!trace_req(p) + && p->p_flag & P_TRACED); + + /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* + * If parent wants us to take the signal, + * then it will leave it in p->p_xstat; + * otherwise we just look for signals again. 
+ */ + p->p_siglist &= ~mask; /* clear the old signal */ + signum = p->p_xstat; + if (signum == 0) + continue; + + /* + * Put the new signal into p_siglist. If the + * signal is being masked, look for other signals. + */ + mask = sigmask(signum); + p->p_siglist |= mask; + if (p->p_sigmask & mask) + continue; + } + + /* + * Decide whether the signal should be returned. + * Return the signal's number, or fall through + * to clear it from the pending mask. + */ + switch ((int)(intptr_t)p->p_sigacts->ps_sigact[signum]) { + + case (int)SIG_DFL: + /* + * Don't take default actions on system processes. + */ + if (p->p_pid <= 1) { +#ifdef DIAGNOSTIC + /* + * Are you sure you want to ignore SIGSEGV + * in init? XXX + */ + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, signum); +#endif + break; /* == ignore */ + } + /* + * If there is a pending stop signal to process + * with default action, stop here, + * then clear the signal. However, + * if process is member of an orphaned + * process group, ignore tty stop signals. + */ + if (prop & SA_STOP) { + if (p->p_flag & P_TRACED || + (p->p_pgrp->pg_jobc == 0 && + prop & SA_TTYSTOP)) + break; /* == ignore */ + p->p_xstat = signum; + stop(p); +#ifndef COMPAT_LINUX_THREADS + if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0) +#else + if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0) +#endif /* COMPAT_LINUX_THREADS */ + psignal(p->p_pptr, SIGCHLD); + mi_switch(); + break; + } else if (prop & SA_IGNORE) { + /* + * Except for SIGCONT, shouldn't get here. + * Default action is to ignore; drop it. + */ + break; /* == ignore */ + } else + return (signum); + /*NOTREACHED*/ + + case (int)SIG_IGN: + /* + * Masking above should prevent us ever trying + * to take action on an ignored signal other + * than SIGCONT, unless process is traced. + */ + if ((prop & SA_CONT) == 0 && + (p->p_flag & P_TRACED) == 0) + printf("issignal\n"); + break; /* == ignore */ + + default: + /* + * This signal has an action, let + * postsig() process it. + */ + return (signum); + } + p->p_siglist &= ~mask; /* take the signal! */ + } + /* NOTREACHED */ +} + +/* + * Put the argument process into the stopped state and notify the parent + * via wakeup. Signals are handled elsewhere. The process must not be + * on the run queue. + */ +void +stop(p) + register struct proc *p; +{ + + p->p_stat = SSTOP; + p->p_flag &= ~P_WAITED; + wakeup((caddr_t)p->p_pptr); +} + +/* + * Take the action for the specified signal + * from the current set of pending signals. + */ +void +postsig(signum) + register int signum; +{ + register struct proc *p = curproc; + register struct sigacts *ps = p->p_sigacts; + register sig_t action; + int code, mask, returnmask; + + KASSERT(signum != 0, ("postsig")); + + mask = sigmask(signum); + p->p_siglist &= ~mask; + action = ps->ps_sigact[signum]; +#ifdef KTRACE + if (KTRPOINT(p, KTR_PSIG)) + ktrpsig(p->p_tracep, +#ifndef COMPAT_LINUX_THREADS + signum, action, ps->ps_flags & SAS_OLDMASK ? + ps->ps_oldmask : p->p_sigmask, 0); +#else + signum, action, p->p_oldsigmask ? + p->p_oldsigmask : p->p_sigmask, 0); +#endif /* COMPAT_LINUX_THREADS */ +#endif + STOPEVENT(p, S_SIG, signum); + + if (action == SIG_DFL) { + /* + * Default action, where the default is to kill + * the process. (Other cases were ignored above.) + */ + sigexit(p, signum); + /* NOTREACHED */ + } else { + /* + * If we get here, the signal must be caught. 
+ */ + KASSERT(action != SIG_IGN && (p->p_sigmask & mask) == 0, + ("postsig action")); + /* + * Set the new mask value and also defer further + * occurences of this signal. + * + * Special case: user has done a sigpause. Here the + * current mask is not of interest, but rather the + * mask from before the sigpause is what we want + * restored after the signal processing is completed. + */ + (void) splhigh(); +#ifndef COMPAT_LINUX_THREADS + if (ps->ps_flags & SAS_OLDMASK) { + returnmask = ps->ps_oldmask; + ps->ps_flags &= ~SAS_OLDMASK; +#else + if (p->p_oldsigmask) { + returnmask = p->p_oldsigmask; + p->p_oldsigmask = 0; +#endif /* COMPAT_LINUX_THREADS */ + } else + returnmask = p->p_sigmask; + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } + (void) spl0(); + p->p_stats->p_ru.ru_nsignals++; +#ifndef COMPAT_LINUX_THREADS + if (ps->ps_sig != signum) { +#else + if (p->p_sig != signum) { +#endif /* COMPAT_LINUX_THREADS */ + code = 0; + } else { +#ifndef COMPAT_LINUX_THREADS + code = ps->ps_code; + ps->ps_code = 0; + ps->ps_sig = 0; +#else + code = p->p_code; + p->p_code = 0; + p->p_sig = 0; +#endif /* COMPAT_LINUX_THREADS */ + } + (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code); + } +} + +/* + * Kill the current process for stated reason. + */ +void +killproc(p, why) + struct proc *p; + char *why; +{ + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why); + psignal(p, SIGKILL); +} + +/* + * Force the current process to exit with the specified signal, dumping core + * if appropriate. We bypass the normal tests for masked and caught signals, + * allowing unrecoverable failures to terminate the process without changing + * signal state. Mark the accounting record with the signal termination. + * If dumping core, save the signal number for the debugger. Calls exit and + * does not return. + */ +void +sigexit(p, signum) + register struct proc *p; + int signum; +{ + + p->p_acflag |= AXSIG; + if (sigprop[signum] & SA_CORE) { +#ifndef COMPAT_LINUX_THREADS + p->p_sigacts->ps_sig = signum; +#else + p->p_sig = signum; +#endif /* COMPAT_LINUX_THREADS */ + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) + * XXX : Todo, as well as euid, write out ruid too + */ + if (p->p_sysent->sv_coredump != NULL && + (*p->p_sysent->sv_coredump)(p) == 0) + signum |= WCOREFLAG; + if (kern_logsigexit) + log(LOG_INFO, + "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, + signum &~ WCOREFLAG, + signum & WCOREFLAG ? " (core dumped)" : ""); + } + exit1(p, W_EXITCODE(0, signum)); + /* NOTREACHED */ +} + +static char corefilename[MAXPATHLEN+1] = {"%N.core"}; +SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, + sizeof(corefilename), "process corefile name format string"); + +/* + * expand_name(name, uid, pid) + * Expand the name described in corefilename, using name, uid, and pid. 
+ * corefilename is a printf-like string, with three format specifiers: + * %N name of process ("name") + * %P process id (pid) + * %U user id (uid) + * For example, "%N.core" is the default; they can be disabled completely + * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". + * This is controlled by the sysctl variable kern.corefile (see above). + */ + +char * +expand_name(name, uid, pid) +const char *name; int uid; int pid; { + char *temp; + char buf[11]; /* Buffer for pid/uid -- max 4B */ + int i, n; + char *format = corefilename; + + temp = malloc(MAXPATHLEN + 3, M_TEMP, M_NOWAIT); + if (temp == NULL) + return NULL; + bzero(temp, MAXPATHLEN+3); + for (i = 0, n = 0; i < MAXPATHLEN && format[i]; i++) { + int l; + switch (format[i]) { + case '%': /* Format character */ + i++; + switch (format[i]) { + case '%': + temp[n++] = '%'; + break; + case 'N': /* process name */ + l = strlen(name); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, name, l); + n += l; + break; + case 'P': /* process id */ + sprintf(buf, "%u", pid); + l = strlen(buf); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, buf, l); + n += l; + break; + case 'U': /* user id */ + sprintf(buf, "%u", uid); + l = strlen(buf); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, buf, l); + n += l; + break; + default: + log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); + } + break; + default: + temp[n++] = format[i]; + } + } + return temp; +} + +/* + * Nonexistent system call-- signal process (may want to handle it). + * Flag error in case process won't see signal immediately (blocked or ignored). + */ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +nosys(p, args) + struct proc *p; + struct nosys_args *args; +{ + + psignal(p, SIGSYS); + return (EINVAL); +} + +/* + * Send a signal to a SIGIO or SIGURG to a process or process group using + * stored credentials rather than those of the current process. + */ +void +pgsigio(sigio, signum, checkctty) + struct sigio *sigio; + int signum, checkctty; +{ + if (sigio == NULL) + return; + + if (sigio->sio_pgid > 0) { + if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, + sigio->sio_proc)) + psignal(sigio->sio_proc, signum); + } else if (sigio->sio_pgid < 0) { + struct proc *p; + + for (p = sigio->sio_pgrp->pg_members.lh_first; p != NULL; + p = p->p_pglist.le_next) + if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, p) && + (checkctty == 0 || (p->p_flag & P_CONTROLT))) + psignal(p, signum); + } +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c new file mode 100644 index 0000000..a96d554 --- /dev/null +++ b/sys/kern/kern_subr.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $Id: kern_subr.c,v 1.23 1999/01/08 17:31:10 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> + +int +uiomove(cp, n, uio) + register caddr_t cp; + register int n; + register struct uio *uio; +{ + register struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomove: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc, + ("uiomove proc")); + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + if (uio->uio_rw == UIO_READ) + error = copyout(cp, iov->iov_base, cnt); + else + error = copyin(iov->iov_base, cp, cnt); + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy((caddr_t)cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, (caddr_t)cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} + +int +uiomoveco(cp, n, uio, obj) + caddr_t cp; + int n; + struct uio *uio; + struct vm_object *obj; +{ + struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomoveco: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc, + ("uiomoveco proc")); + + while (n > 0 && uio->uio_resid) { + 
iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + if (uio->uio_rw == UIO_READ) { + if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) && + ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); + } else { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy((caddr_t)cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, (caddr_t)cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} + +int +uioread(n, uio, obj, nread) + int n; + struct uio *uio; + struct vm_object *obj; + int *nread; +{ + int npagesmoved; + struct iovec *iov; + u_int cnt, tcnt; + int error; + + *nread = 0; + if (vfs_ioopt < 2) + return 0; + + error = 0; + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + if ((uio->uio_segflg == UIO_USERSPACE) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) ) { + + if (cnt < PAGE_SIZE) + break; + + cnt &= ~PAGE_MASK; + + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, &npagesmoved); + + if (npagesmoved == 0) + break; + + tcnt = npagesmoved * PAGE_SIZE; + cnt = tcnt; + + if (error) + break; + + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + *nread += cnt; + n -= cnt; + } else { + break; + } + } + return error; +} + +/* + * Give next character to user as result of read. + */ +int +ureadc(c, uio) + register int c; + register struct uio *uio; +{ + register struct iovec *iov; + +again: + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iovcnt--; + uio->uio_iov++; + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (subyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE: + *iov->iov_base = c; + break; + + case UIO_USERISPACE: + if (suibyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (0); +} + +#ifdef vax /* unused except by ct.c, other oddities XXX */ +/* + * Get next character written in by user from uio. 
+ */ +int +uwritec(uio) + struct uio *uio; +{ + register struct iovec *iov; + register int c; + + if (uio->uio_resid <= 0) + return (-1); +again: + if (uio->uio_iovcnt <= 0) + panic("uwritec"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iov++; + if (--uio->uio_iovcnt == 0) + return (-1); + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + c = fubyte(iov->iov_base); + break; + + case UIO_SYSSPACE: + c = *(u_char *) iov->iov_base; + break; + + case UIO_USERISPACE: + c = fuibyte(iov->iov_base); + break; + } + if (c < 0) + return (-1); + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (c); +} +#endif /* vax */ + +/* + * General routine to allocate a hash table. + */ +void * +hashinit(elements, type, hashmask) + int elements; + struct malloc_type *type; + u_long *hashmask; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("hashinit: bad elements"); + for (hashsize = 1; hashsize <= elements; hashsize <<= 1) + continue; + hashsize >>= 1; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + return (hashtbl); +} + +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; +#define NPRIMES (sizeof(primes) / sizeof(primes[0])) + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements; + struct malloc_type *type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c new file mode 100644 index 0000000..f8baf85 --- /dev/null +++ b/sys/kern/kern_synch.c @@ -0,0 +1,923 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $Id: kern_synch.c,v 1.71 1999/01/08 17:31:10 eivind Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/vmmeter.h> +#include <sys/sysctl.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <machine/cpu.h> +#ifdef SMP +#include <machine/smp.h> +#endif +#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */ + +static void rqinit __P((void *)); +SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) + +u_char curpriority; /* usrpri of curproc */ +int lbolt; /* once a second sleep address */ + +static void endtsleep __P((void *)); +static void roundrobin __P((void *arg)); +static void schedcpu __P((void *arg)); +static void updatepri __P((struct proc *p)); + +#define MAXIMUM_SCHEDULE_QUANTUM (1000000) /* arbitrary limit */ +#ifndef DEFAULT_SCHEDULE_QUANTUM +#define DEFAULT_SCHEDULE_QUANTUM 10 +#endif +static int quantum = DEFAULT_SCHEDULE_QUANTUM; /* default value */ + +static int +sysctl_kern_quantum SYSCTL_HANDLER_ARGS +{ + int error; + int new_val = quantum; + + new_val = quantum; + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error == 0) { + if ((new_val > 0) && (new_val < MAXIMUM_SCHEDULE_QUANTUM)) { + quantum = new_val; + } else { + error = EINVAL; + } + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof quantum, sysctl_kern_quantum, "I", ""); + +/* maybe_resched: Decide if you need to reschedule or not + * taking the priorities and schedulers into account. + */ +static void maybe_resched(struct proc *chk) +{ + struct proc *p = curproc; /* XXX */ + + /* + * Compare priorities if the new process is on the same scheduler, + * otherwise the one on the more realtimeish scheduler wins. + * + * XXX idle scheduler still broken because proccess stays on idle + * scheduler during waits (such as when getting FS locks). If a + * standard process becomes runaway cpu-bound, the system can lockup + * due to idle-scheduler processes in wakeup never getting any cpu. 
+ */ + if (p == 0 || + (chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) || + RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type) + ) { + need_resched(); + } +} + +#define ROUNDROBIN_INTERVAL (hz / quantum) +int roundrobin_interval(void) +{ + return ROUNDROBIN_INTERVAL; +} + +/* + * Force switch among equal priority processes every 100ms. + */ +/* ARGSUSED */ +static void +roundrobin(arg) + void *arg; +{ +#ifndef SMP + struct proc *p = curproc; /* XXX */ +#endif + +#ifdef SMP + need_resched(); + forward_roundrobin(); +#else + if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type)) + need_resched(); +#endif + + timeout(roundrobin, NULL, ROUNDROBIN_INTERVAL); +} + +/* + * Constants for digital decay and forget: + * 90% of (p_estcpu) usage in 5 * loadav time + * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) + * Note that, as ps(1) mentions, this can let percentages + * total over 100% (I've seen 137.9% for 3 processes). + * + * Note that statclock() updates p_estcpu and p_cpticks asynchronously. + * + * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. + * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED + * + * Actual power values for the implemented algorithm are as follows: + * loadav: 1 2 3 4 + * power: 5.68 10.32 14.94 19.55 + */ + +/* calculations for digital decay to forget 90% of usage in 5*loadav sec */ +#define loadfactor(loadav) (2 * (loadav)) +#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) + +/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + +/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ +static int fscale __unused = FSCALE; +SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); + +/* + * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the + * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below + * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). + * + * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: + * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). + * + * If you don't want to bother with the faster/more-accurate formula, you + * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate + * (more general) method of calculating the %age of CPU used by a process. + */ +#define CCPU_SHIFT 11 + +/* + * Recompute process priorities, every hz ticks. + */ +/* ARGSUSED */ +static void +schedcpu(arg) + void *arg; +{ + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + register struct proc *p; + register int realstathz, s; + register unsigned int newcpu; + + realstathz = stathz ? stathz : hz; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + /* + * Increment time in/out of memory and sleep time + * (if sleeping). We ignore overflow; with 16-bit int's + * (remember them?) overflow takes 45 days. + */ + p->p_swtime++; + if (p->p_stat == SSLEEP || p->p_stat == SSTOP) + p->p_slptime++; + p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; + /* + * If the process has slept the entire second, + * stop recalculating its priority until it wakes up. + */ + if (p->p_slptime > 1) + continue; + s = splhigh(); /* prevent state changes and protect run queue */ + /* + * p_pctcpu is only for ps. + */ +#if (FSHIFT >= CCPU_SHIFT) + p->p_pctcpu += (realstathz == 100)? + ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): + 100 * (((fixpt_t) p->p_cpticks) + << (FSHIFT - CCPU_SHIFT)) / realstathz; +#else + p->p_pctcpu += ((FSCALE - ccpu) * + (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT; +#endif + p->p_cpticks = 0; + newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice; + p->p_estcpu = min(newcpu, UCHAR_MAX); + resetpriority(p); + if (p->p_priority >= PUSER) { +#define PPQ (128 / NQS) /* priorities per queue */ + if ((p != curproc) && +#ifdef SMP + (u_char)p->p_oncpu == 0xff && /* idle */ +#endif + p->p_stat == SRUN && + (p->p_flag & P_INMEM) && + (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) { + remrq(p); + p->p_priority = p->p_usrpri; + setrunqueue(p); + } else + p->p_priority = p->p_usrpri; + } + splx(s); + } + vmmeter(); + wakeup((caddr_t)&lbolt); + timeout(schedcpu, (void *)0, hz); +} + +/* + * Recalculate the priority of a process after it has slept for a while. + * For all load averages >= 1 and max p_estcpu of 255, sleeping for at + * least six times the loadfactor will decay p_estcpu to zero. 
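
As an aside, the decay arithmetic derived above is easy to check numerically. The program below is a stand-alone user-space sketch, not part of this file; it assumes FSCALE is 2048 (FSHIFT of 11, as in param.h of this era) and simply re-applies the decay_cpu() formula that the derivation describes.

	#include <stdio.h>

	#define FSCALE	2048			/* assumed; see <sys/param.h> */
	#define loadfactor(loadav)	(2 * (loadav))
	#define decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

	int
	main(void)
	{
		long loadav = 2 * FSCALE;	/* load average of 2.0, fixed point */
		long loadfac = loadfactor(loadav);
		long estcpu = 255;		/* maximum p_estcpu */
		int i;

		/* one decay step per second, for 5 * loadav seconds */
		for (i = 0; i < 5 * (loadav / FSCALE); i++)
			estcpu = decay_cpu(loadfac, estcpu);
		printf("p_estcpu after 5*loadav steps: %ld (about 10%% of 255)\n",
		    estcpu);
		return (0);
	}

With these numbers the per-step factor is 8192/10240 = 0.8, and 0.8 to the tenth power is roughly 0.107, which matches the "decay 90% of p_estcpu in 5*loadav seconds" claim above.
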
+ */ +static void +updatepri(p) + register struct proc *p; +{ + register unsigned int newcpu = p->p_estcpu; + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + + if (p->p_slptime > 5 * loadfac) + p->p_estcpu = 0; + else { + p->p_slptime--; /* the first time was done in schedcpu */ + while (newcpu && --p->p_slptime) + newcpu = (int) decay_cpu(loadfac, newcpu); + p->p_estcpu = min(newcpu, UCHAR_MAX); + } + resetpriority(p); +} + +/* + * We're only looking at 7 bits of the address; everything is + * aligned to 4, lots of things are aligned to greater powers + * of 2. Shift right by 8, i.e. drop the bottom 256 worth. + */ +#define TABLESIZE 128 +static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; +#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) + +/* + * During autoconfiguration or after a panic, a sleep will simply + * lower the priority briefly to allow interrupts, then return. + * The priority to be used (safepri) is machine-dependent, thus this + * value is initialized and maintained in the machine-dependent layers. + * This priority will typically be 0, or the lowest priority + * that is safe for use on the interrupt stack; it can be made + * higher to block network software interrupts after panics. + */ +int safepri; + +void +sleepinit() +{ + int i; + + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + +/* + * General sleep call. Suspends the current process until a wakeup is + * performed on the specified identifier. The process will then be made + * runnable with the specified priority. Sleeps at most timo/hz seconds + * (0 means no timeout). If pri includes PCATCH flag, signals are checked + * before and after sleeping, else signals are not checked. Returns 0 if + * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a + * signal needs to be delivered, ERESTART is returned if the current system + * call should be restarted if possible, and EINTR is returned if the system + * call should be interrupted by the signal (return EINTR). + */ +int +tsleep(ident, priority, wmesg, timo) + void *ident; + int priority, timo; + const char *wmesg; +{ + struct proc *p = curproc; + int s, sig, catch = priority & PCATCH; + struct callout_handle thandle; + +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 1, 0); +#endif + s = splhigh(); + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, + * just give interrupts a chance, then just return; + * don't run any other procs or panic below, + * in case this is the idle process and already asleep. + */ + splx(safepri); + splx(s); + return (0); + } + KASSERT(p != NULL, ("tsleep1")); + KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep")); + /* + * Process may be sitting on a slpque if asleep() was called, remove + * it before re-adding. + */ + if (p->p_wchan != NULL) + unsleep(p); + + p->p_wchan = ident; + p->p_wmesg = wmesg; + p->p_slptime = 0; + p->p_priority = priority & PRIMASK; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); + if (timo) + thandle = timeout(endtsleep, (void *)p, timo); + /* + * We put ourselves on the sleep queue and start our timeout + * before calling CURSIG, as we could stop there, and a wakeup + * or a SIGCONT (or both) could occur while we were stopped. + * A SIGCONT would cause us to be marked as SSLEEP + * without resuming us, thus we must be ready for sleep + * when CURSIG is called. If the wakeup happens while we're + * stopped, p->p_wchan will be 0 upon return from CURSIG. 
+ */ + if (catch) { + p->p_flag |= P_SINTR; + if ((sig = CURSIG(p))) { + if (p->p_wchan) + unsleep(p); + p->p_stat = SRUN; + goto resume; + } + if (p->p_wchan == 0) { + catch = 0; + goto resume; + } + } else + sig = 0; + p->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); +resume: + curpriority = p->p_usrpri; + splx(s); + p->p_flag &= ~P_SINTR; + if (p->p_flag & P_TIMEOUT) { + p->p_flag &= ~P_TIMEOUT; + if (sig == 0) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (EWOULDBLOCK); + } + } else if (timo) + untimeout(endtsleep, (void *)p, thandle); + if (catch && (sig != 0 || (sig = CURSIG(p)))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + if (p->p_sigacts->ps_sigintr & sigmask(sig)) + return (EINTR); + return (ERESTART); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (0); +} + +/* + * asleep() - async sleep call. Place process on wait queue and return + * immediately without blocking. The process stays runnable until await() + * is called. If ident is NULL, remove process from wait queue if it is still + * on one. + * + * Only the most recent sleep condition is effective when making successive + * calls to asleep() or when calling tsleep(). + * + * The timeout, if any, is not initiated until await() is called. The sleep + * priority, signal, and timeout is specified in the asleep() call but may be + * overriden in the await() call. + * + * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> + */ + +int +asleep(void *ident, int priority, const char *wmesg, int timo) +{ + struct proc *p = curproc; + int s; + + /* + * splhigh() while manipulating sleep structures and slpque. + * + * Remove preexisting wait condition (if any) and place process + * on appropriate slpque, but do not put process to sleep. + */ + + s = splhigh(); + + if (p->p_wchan != NULL) + unsleep(p); + + if (ident) { + p->p_wchan = ident; + p->p_wmesg = wmesg; + p->p_slptime = 0; + p->p_asleep.as_priority = priority; + p->p_asleep.as_timo = timo; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); + } + + splx(s); + + return(0); +} + +/* + * await() - wait for async condition to occur. The process blocks until + * wakeup() is called on the most recent asleep() address. If wakeup is called + * priority to await(), await() winds up being a NOP. + * + * If await() is called more then once (without an intervening asleep() call), + * await() is still effectively a NOP but it calls mi_switch() to give other + * processes some cpu before returning. The process is left runnable. + * + * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> + */ + +int +await(int priority, int timo) +{ + struct proc *p = curproc; + int s; + + s = splhigh(); + + if (p->p_wchan != NULL) { + struct callout_handle thandle; + int sig; + int catch; + + /* + * The call to await() can override defaults specified in + * the original asleep(). 
+ */ + if (priority < 0) + priority = p->p_asleep.as_priority; + if (timo < 0) + timo = p->p_asleep.as_timo; + + /* + * Install timeout + */ + + if (timo) + thandle = timeout(endtsleep, (void *)p, timo); + + sig = 0; + catch = priority & PCATCH; + + if (catch) { + p->p_flag |= P_SINTR; + if ((sig = CURSIG(p))) { + if (p->p_wchan) + unsleep(p); + p->p_stat = SRUN; + goto resume; + } + if (p->p_wchan == NULL) { + catch = 0; + goto resume; + } + } + p->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); +resume: + curpriority = p->p_usrpri; + + splx(s); + p->p_flag &= ~P_SINTR; + if (p->p_flag & P_TIMEOUT) { + p->p_flag &= ~P_TIMEOUT; + if (sig == 0) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (EWOULDBLOCK); + } + } else if (timo) + untimeout(endtsleep, (void *)p, thandle); + if (catch && (sig != 0 || (sig = CURSIG(p)))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + if (p->p_sigacts->ps_sigintr & sigmask(sig)) + return (EINTR); + return (ERESTART); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + } else { + /* + * If as_priority is 0, await() has been called without an + * intervening asleep(). We are still effectively a NOP, + * but we call mi_switch() for safety. + */ + + if (p->p_asleep.as_priority == 0) { + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + } + splx(s); + } + + /* + * clear p_asleep.as_priority as an indication that await() has been + * called. If await() is called again without an intervening asleep(), + * await() is still effectively a NOP but the above mi_switch() code + * is triggered as a safety. + */ + p->p_asleep.as_priority = 0; + + return (0); +} + +/* + * Implement timeout for tsleep or asleep()/await() + * + * If process hasn't been awakened (wchan non-zero), + * set timeout flag and undo the sleep. If proc + * is stopped, just unsleep so it will remain stopped. + */ +static void +endtsleep(arg) + void *arg; +{ + register struct proc *p; + int s; + + p = (struct proc *)arg; + s = splhigh(); + if (p->p_wchan) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + p->p_flag |= P_TIMEOUT; + } + splx(s); +} + +/* + * Remove a process from its wait queue + */ +void +unsleep(p) + register struct proc *p; +{ + int s; + + s = splhigh(); + if (p->p_wchan) { + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); + p->p_wchan = 0; + } + splx(s); +} + +/* + * Make all processes sleeping on the specified identifier runnable. + */ +void +wakeup(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; +restart: + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + maybe_resched(p); + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + splx(s); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target prcoess is currently + * swapped out. 
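
As an aside, the tsleep()/wakeup() pairing documented above is typically used as a simple busy-flag handshake. The fragment below is only a sketch of that idiom: the softc fields sc_busy and sc_wanted are hypothetical, and the code assumes the usual kernel headers rather than being buildable on its own. The memlock handshake in kernel_sysctl(), later in this change, follows the same pattern.

	/* acquire: sleep until the resource is free */
	s = splhigh();
	while (sc->sc_busy) {
		sc->sc_wanted = 1;
		(void) tsleep((caddr_t)sc, PRIBIO + 1, "scbusy", 0);
	}
	sc->sc_busy = 1;
	splx(s);

	/* ... use the shared resource ... */

	/* release: clear the flag and wake any waiters */
	s = splhigh();
	sc->sc_busy = 0;
	if (sc->sc_wanted) {
		sc->sc_wanted = 0;
		wakeup((caddr_t)sc);
	}
	splx(s);
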
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; + + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + maybe_resched(p); + break; + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + } + } + } + splx(s); +} + +/* + * The machine independent parts of mi_switch(). + * Must be called at splstatclock() or higher. + */ +void +mi_switch() +{ + register struct proc *p = curproc; /* XXX */ + register struct rlimit *rlim; + int x; + + /* + * XXX this spl is almost unnecessary. It is partly to allow for + * sloppy callers that don't do it (issignal() via CURSIG() is the + * main offender). It is partly to work around a bug in the i386 + * cpu_switch() (the ipl is not preserved). We ran for years + * without it. I think there was only a interrupt latency problem. + * The main caller, tsleep(), does an splx() a couple of instructions + * after calling here. The buggy caller, issignal(), usually calls + * here at spl0() and sometimes returns at splhigh(). The process + * then runs for a little too long at splhigh(). The ipl gets fixed + * when the process returns to user mode (or earlier). + * + * It would probably be better to always call here at spl0(). Callers + * are prepared to give up control to another process, so they must + * be prepared to be interrupted. The clock stuff here may not + * actually need splstatclock(). + */ + x = splstatclock(); + +#ifdef SIMPLELOCK_DEBUG + if (p->p_simple_locks) + printf("sleep: holding simple lock\n"); +#endif + /* + * Compute the amount of time during which the current + * process was running, and add that to its total so far. + */ + microuptime(&switchtime); + p->p_runtime += (switchtime.tv_usec - p->p_switchtime.tv_usec) + + (switchtime.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000; + + /* + * Check if the process exceeds its cpu resource allocation. + * If over max, kill it. + */ + if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + p->p_runtime > p->p_limit->p_cpulimit) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { + killproc(p, "exceeded maximum CPU limit"); + } else { + psignal(p, SIGXCPU); + if (rlim->rlim_cur < rlim->rlim_max) { + /* XXX: we should make a private copy */ + rlim->rlim_cur += 5; + } + } + } + + /* + * Pick a new current process and record its start time. + */ + cnt.v_swtch++; + cpu_switch(p); + if (switchtime.tv_sec) + p->p_switchtime = switchtime; + else + microuptime(&p->p_switchtime); + splx(x); +} + +/* + * Initialize the (doubly-linked) run queues + * to be empty. + */ +/* ARGSUSED*/ +static void +rqinit(dummy) + void *dummy; +{ + register int i; + + for (i = 0; i < NQS; i++) { + qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; + rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i]; + idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i]; + } +} + +/* + * Change process state to be runnable, + * placing it on the run queue if it is in memory, + * and awakening the swapper if it isn't in memory. 
+ */ +void +setrunnable(p) + register struct proc *p; +{ + register int s; + + s = splhigh(); + switch (p->p_stat) { + case 0: + case SRUN: + case SZOMB: + default: + panic("setrunnable"); + case SSTOP: + case SSLEEP: + unsleep(p); /* e.g. when sending signals */ + break; + + case SIDL: + break; + } + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) + setrunqueue(p); + splx(s); + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + if ((p->p_flag & P_INMEM) == 0) { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + else + maybe_resched(p); +} + +/* + * Compute the priority of a process when running in user mode. + * Arrange to reschedule if the resulting priority is better + * than that of the current process. + */ +void +resetpriority(p) + register struct proc *p; +{ + register unsigned int newpriority; + + if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + } + maybe_resched(p); +} + +/* ARGSUSED */ +static void sched_setup __P((void *dummy)); +static void +sched_setup(dummy) + void *dummy; +{ + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); +} +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) + diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c new file mode 100644 index 0000000..e1192a9 --- /dev/null +++ b/sys/kern/kern_syscalls.c @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 1999 Assar Westerlund + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_syscalls.c,v 1.2 1999/01/09 14:59:50 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/proc.h> + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. 
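
As an aside, the registration functions that follow are what a loadable module would use to claim one of these placeholder slots. The fragment below is only an illustrative sketch: the syscall body hello_syscall and its variables are hypothetical, and the struct sysent initializer assumes the two-member (narg, call) layout of this era.

	static int
	hello_syscall(struct proc *p, void *uap)
	{
		printf("hello from a dynamically assigned syscall slot\n");
		return (0);
	}

	static struct sysent hello_sysent = { 0, (sy_call_t *)hello_syscall };
	static struct sysent hello_old_sysent;
	static int hello_offset = NO_SYSCALL;	/* let syscall_register() pick a slot */

	static int
	hello_load(void)
	{
		return (syscall_register(&hello_offset, &hello_sysent,
		    &hello_old_sysent));
	}

	static int
	hello_unload(void)
	{
		return (syscall_deregister(&hello_offset, &hello_old_sysent));
	}
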
+ */ +int +lkmnosys(struct proc *p, struct nosys_args *args) +{ + return(nosys(p, args)); +} + +int +syscall_register(int *offset, struct sysent *new_sysent, + struct sysent *old_sysent) +{ + if (*offset == NO_SYSCALL) { + int i; + + for (i = 1; i < SYS_MAXSYSCALL; ++i) + if (sysent[i].sy_call == (sy_call_t *)lkmnosys) + break; + if (i == SYS_MAXSYSCALL) + return ENFILE; + *offset = i; + } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) + return EINVAL; + else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys) + return EEXIST; + + *old_sysent = sysent[*offset]; + sysent[*offset] = *new_sysent; + return 0; +} + +int +syscall_deregister(int *offset, struct sysent *old_sysent) +{ + if (*offset) + sysent[*offset] = *old_sysent; + return 0; +} + +int +syscall_module_handler(struct module *mod, int what, void *arg) +{ + struct syscall_module_data *data = (struct syscall_module_data*)arg; + modspecific_t ms; + int error; + + switch (what) { + case MOD_LOAD : + error = syscall_register(data->offset, data->new_sysent, + &data->old_sysent); + if (error) + return error; + ms.intval = *data->offset; + module_setspecific(mod, &ms); + break; + case MOD_UNLOAD : + error = syscall_deregister(data->offset, &data->old_sysent); + if (error) + return error; + break; + } + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 0000000..fbf2f6a --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,1122 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_sysctl.c,v 1.81 1998/12/27 18:03:29 dfr Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); + +/* + * Locking and stats + */ +static struct sysctl_lock { + int sl_lock; + int sl_want; + int sl_locked; +} memlock; + +static int sysctl_root SYSCTL_HANDLER_ARGS; + +extern struct linker_set sysctl_; + +/* + * Initialization of the MIB tree. + * + * Order by number in each linker_set. + */ + +static int +sysctl_order_cmp(const void *a, const void *b) +{ + struct sysctl_oid const * const *pa; + struct sysctl_oid const * const *pb; + + pa = (struct sysctl_oid const * const *)a; + pb = (struct sysctl_oid const * const *)b; + if (*pa == NULL && *pb == NULL) + return 0; + if (*pa == NULL) + return (1); + if (*pb == NULL) + return (-1); + return ((*pa)->oid_number - (*pb)->oid_number); +} + +static void +sysctl_order(void *arg) +{ + int j, k; + struct linker_set *l = (struct linker_set *) arg; + struct sysctl_oid **oidpp; + + /* First, find the highest oid we have */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (k = 0; j--; oidpp++) { + if (!*oidpp) + continue; + if ((*oidpp)->oid_arg1 == arg) { + *oidpp = 0; + continue; + } + if ((*oidpp)->oid_number > k) + k = (*oidpp)->oid_number; + } + + /* Next, replace all OID_AUTO oids with new numbers */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + k += 100; + for (; j--; oidpp++) + if (*oidpp && (*oidpp)->oid_number == OID_AUTO) + (*oidpp)->oid_number = k++; + + /* Finally: sort by oid */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + if (!*oidpp) + continue; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) + if (!(*oidpp)->oid_handler) + sysctl_order((*oidpp)->oid_arg1); + } + qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0], + sysctl_order_cmp); +} + +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_); + +void +sysctl_order_all(void) +{ + sysctl_order(&sysctl_); +} + +/* + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." OID. + * {0,2,...} return the next OID. + * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. 
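
As an aside, the {0,3} name-to-OID entry listed above can be exercised directly from user space with sysctl(3); the short program below is an editorial illustration rather than part of this file.

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		int mib[2] = { 0, 3 };		/* the sysctl.name2oid node */
		int oid[CTL_MAXNAME];
		size_t len = sizeof(oid);
		char name[] = "kern.ostype";
		size_t i;

		/* pass the name as "new" data, read the numeric OID back as "old" */
		if (sysctl(mib, 2, oid, &len, name, strlen(name)) == -1) {
			perror("sysctl");
			return (1);
		}
		for (i = 0; i < len / sizeof(int); i++)
			printf("%d%s", oid[i],
			    i + 1 < len / sizeof(int) ? "." : "\n");
		return (0);
	}

For "kern.ostype" this prints "1.1" (CTL_KERN, KERN_OSTYPE).
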
+ */ + +static void +sysctl_sysctl_debug_dump_node(struct linker_set *l, int i) +{ + int j, k; + struct sysctl_oid **oidpp; + + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + + if (!*oidpp) + continue; + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name); + + printf("%c%c", + (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ', + (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' '); + + if ((*oidpp)->oid_handler) + printf(" *Handler"); + + switch ((*oidpp)->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!(*oidpp)->oid_handler) { + sysctl_sysctl_debug_dump_node( + (*oidpp)->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + + } +} + +static int +sysctl_sysctl_debug SYSCTL_HANDLER_ARGS +{ + sysctl_sysctl_debug_dump_node(&sysctl_, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error = 0; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char buf[10]; + + while (namelen) { + if (!lsp) { + snprintf(buf,sizeof(buf),"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + lsp = 0; + for (i = 0; i < j; i++, oidpp++) { + if (*oidpp && ((*oidpp)->oid_number != *name)) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_name, + strlen((*oidpp)->oid_name)); + if (error) + return (error); + + namelen--; + name++; + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + break; + } + } + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + *len = level; + for (i = 0; i < j; i++, oidpp++) { + if (!*oidpp) + continue; + + *next = (*oidpp)->oid_number; + *oidp = *oidpp; + + if (!namelen) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1, + len, level+1, oidp)) + return 0; + goto next; + } + + if ((*oidpp)->oid_number < *name) + continue; + + if ((*oidpp)->oid_number > *name) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, + next+1, len, level+1, oidp)) + return (0); + goto next; + } + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if ((*oidpp)->oid_handler) + continue; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if 
(!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1, + len, level+1, oidp)) + return (0); + next: + namelen = 1; + *len = level; + } + return 1; +} + +static int +sysctl_sysctl_next SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct linker_set *lsp = &sysctl_; + int newoid[CTL_MAXNAME]; + + i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); + return (error); +} + +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + while (j-- && *len < CTL_MAXNAME) { + if (!*oidpp) + continue; + if (strcmp(name, (*oidpp)->oid_name)) { + oidpp++; + continue; + } + *oid++ = (*oidpp)->oid_number; + (*len)++; + + if (!i) { + if (oidp) + *oidp = *oidpp; + return (0); + } + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS +{ + char *p; + int error, oid[CTL_MAXNAME], len; + struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1, error; + u_int namelen = arg2; + int indx, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + goto found; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + if (!(*oidpp)->oid_fmt) + return ENOENT; + error = SYSCTL_OUT(req, + &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind)); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_fmt, + strlen((*oidpp)->oid_fmt)+1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +/* + * Default "handler" functions. + */ + +/* + * Handle an int, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
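+ *
+ * As an illustrative sketch (the knob below is hypothetical), the usual
+ * way to arrive here is the SYSCTL_INT() macro, which points arg1 at the
+ * variable:
+ *
+ *	static int example_knob;
+ *	SYSCTL_INT(_debug, OID_AUTO, example_knob, CTLFLAG_RW,
+ *	    &example_knob, 0, "");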
+ */ + +int +sysctl_handle_int SYSCTL_HANDLER_ARGS +{ + int error = 0; + + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} + +/* + * Handle a long, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. + */ + +int +sysctl_handle_long SYSCTL_HANDLER_ARGS +{ + int error = 0; + + error = SYSCTL_OUT(req, arg1, sizeof(long)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(long)); + return (error); +} + +/* + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. + */ + +int +sysctl_handle_string SYSCTL_HANDLER_ARGS +{ + int error=0; + + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr || !arg2) + return (error); + + if ((req->newlen - req->newidx) > arg2) { + error = E2BIG; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} + +/* + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. + */ + +int +sysctl_handle_opaque SYSCTL_HANDLER_ARGS +{ + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} + +/* + * Transfer functions to/from kernel space. + * XXX: rather untested at this point + */ +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) +{ + size_t i = 0; + + if (req->oldptr) { + i = l; + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (newlen) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +/* + * Transfer function to/from user space. 
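+ *
+ * Orientation notes on the code below: the first SYSCTL_OUT wires the user
+ * buffer with vslock() and advances req->lock to 2 so the root caller knows
+ * to vsunlock() it afterwards.  oldidx is advanced by the full length even
+ * when nothing is copied, so a size probe with a nil oldptr still
+ * accumulates the space the caller must allocate, while a buffer that
+ * fills up gets ENOMEM.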
+ */ +static int +sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) +{ + int error = 0; + size_t i = 0; + + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = l; + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, size_t l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (error); +} + +/* + * Traverse our tree, and find the right node, execute whatever it points + * at, and return the resulting error code. + */ + +int +sysctl_root SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int indx, i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + return ENOENT; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + /* If writing isn't allowed */ + if (req->newptr && (!((*oidpp)->oid_kind & CTLFLAG_WR) || + (((*oidpp)->oid_kind & CTLFLAG_SECURE) && securelevel > 0))) + return (EPERM); + + /* Most likely only root can write */ + if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) && + req->newptr && req->p && + (i = suser(req->p->p_ucred, &req->p->p_acflag))) + return (i); + + if (!(*oidpp)->oid_handler) + return EINVAL; + + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + i = ((*oidpp)->oid_handler) (*oidpp, + name + indx, namelen - indx, + req); + } else { + i = ((*oidpp)->oid_handler) (*oidpp, + (*oidpp)->oid_arg1, (*oidpp)->oid_arg2, + req); + } + return (i); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif + +int +__sysctl(struct proc *p, struct sysctl_args *uap) +{ + int error, i, name[CTL_MAXNAME]; + size_t j; + + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) + return (EINVAL); + + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); + + error = userland_sysctl(p, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + return (error); + if (uap->oldlenp) { + i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + return (i); + } + return (error); +} + +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. 
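+ *
+ * For instance (see ogetkerninfo() further down), a compatibility syscall
+ * builds name[] on its kernel stack -- e.g. { CTL_KERN, KERN_CLOCKRATE } --
+ * and passes the user-supplied buffer and size pointer straight through as
+ * "old" and "oldlenp" with inkernel == 0.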
+ */ +int +userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req, req2; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) + return (error); + } + } + + if (old) { + if (!useracc(old, req.oldlen, B_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if (newlen) { + if (!useracc(new, req.newlen, B_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); + + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +#ifdef COMPAT_43 +#include <sys/socket.h> +#include <vm/vm_param.h> + +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) + +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... -Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ + + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! 
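+				   Rough check: "FreeBSD\0" plus a release
+				   string and "i386\0" is a few dozen bytes,
+				   well inside the 80 available.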
*/ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + size_t *size; + int arg; +}; +#endif + +int +ogetkerninfo(struct proc *p, struct getkerninfo_args *uap) +{ + int error, name[6]; + size_t size; + + switch (uap->op & 0xff00) { + + case KINFO_RT: + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(p, name, 6, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_VNODE: + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_PROC: + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(p, name, 4, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_FILE: + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_METER: + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_LOADAVG: + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_CLOCKRATE: + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * In particular, it doesn't return the same results when + * the supplied buffer is too small. BSDI's version apparently + * will return the amount copied, and set the *size to how + * much was needed. The emulation framework here isn't capable + * of that, so we just set both to the amount copied. + * BSDI's 2.x product apparently fails with ENOMEM in this + * scenario. + */ + + u_int needed; + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if (uap->where == NULL) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? 
*/ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + + default: + return (EOPNOTSUPP); + } + if (error) + return (error); + p->p_retval[0] = size; + if (uap->size) + error = copyout((caddr_t)&size, (caddr_t)uap->size, + sizeof(size)); + return (error); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..2ea378f --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org> + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#if defined(SMP) && defined(BETTER_CLOCK) +#include <machine/smp.h> +#endif + +/* This is where the NTIMECOUNTER option hangs out */ +#include "opt_ntp.h" + +/* + * Number of timecounters used to implement stable storage + */ +#ifndef NTIMECOUNTER +#define NTIMECOUNTER 5 +#endif + +static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", + "Timecounter stable storage"); + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +static void tco_forward __P((int force)); +static void tco_setscales __P((struct timecounter *tc)); +static __inline unsigned tco_delta __P((struct timecounter *tc)); + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +#if defined(SMP) && defined(BETTER_CLOCK) +long cp_time[CPUSTATES]; +#else +static long cp_time[CPUSTATES]; +#endif + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +time_t time_second; + +/* + * Which update policy to use. + * 0 - every tick, bad hardware may fail with "calcru negative..." + * 1 - more resistent to the above hardware, but less efficient. + */ +static int tco_method; + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * timeservices. + */ + +static unsigned +dummy_get_timecount(struct timecounter *tc) +{ + static unsigned now; + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, + 0, + ~0u, + 1000000, + "dummy" +}; + +struct timecounter *timecounter = &dummy_timecounter; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) 
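+ *
+ * For example (illustrative numbers): with stathz = 128 and profhz = 1024,
+ * psratio is 8; while profiling, statclock() runs at 1024 Hz but only every
+ * 8th invocation (when pscnt reaches 0) is charged to the cp_time[] and
+ * per-process statistics.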
+ * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct proc *p; + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + +#if defined(SMP) && defined(BETTER_CLOCK) + forward_hardclock(pscnt); +#endif + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + tco_forward(0); + ticks++; + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } else if (softticks + 1 == ticks) + ++softticks; +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
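+ *
+ * (Check: 2^31 - 1 ticks of 10ms is about 21,474,836 seconds, i.e. roughly
+ * 248.5 days.)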
+ */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + register struct proc *p; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (curproc != NULL && CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state. + */ + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. 
The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.tickadj = tickadj; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +static __inline unsigned +tco_delta(struct timecounter *tc) +{ + + return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) & + tc->tc_counter_mask); +} + +/* + * We have four functions for looking at the clock, two for microseconds + * and two for nanoseconds. For each there is fast but less precise + * version "get{nano|micro}time" which will return a time which is up + * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time" + * will return a timestamp which is as precise as possible. 
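+ *
+ * Rule of thumb following from the above: with the default update policy
+ * the get*() variants merely copy the timestamp cached in the current
+ * timecounter and are cheap, but may lag by up to 1/HZ; the raw variants
+ * read the hardware counter through tco_delta() on every call.  Prefer the
+ * get*() forms unless the extra precision is actually needed.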
+ */ + +void +getmicrotime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tvp = tc->tc_microtime; + } else { + microtime(tvp); + } +} + +void +getnanotime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tsp = tc->tc_nanotime; + } else { + nanotime(tsp); + } +} + +void +microtime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + tv->tv_usec += boottime.tv_usec; + tv->tv_sec += boottime.tv_sec; + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanotime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +timecounter_timespec(unsigned count, struct timespec *ts) +{ + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count -= tc->tc_offset_count; + count &= tc->tc_counter_mask; + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tvp->tv_sec = tc->tc_offset_sec; + tvp->tv_usec = tc->tc_offset_micro; + } else { + microuptime(tvp); + } +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tsp->tv_sec = tc->tc_offset_sec; + tsp->tv_nsec = tc->tc_offset_nano >> 32; + } else { + nanouptime(tsp); + } +} + +void +microuptime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanouptime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + if (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +static void +tco_setscales(struct timecounter *tc) +{ + u_int64_t scale; + + scale = 1000000000LL << 32; + if (tc->tc_adjustment > 0) + scale += (tc->tc_adjustment * 1000LL) << 10; + else + scale -= (-tc->tc_adjustment * 1000LL) << 10; + scale /= tc->tc_frequency; + tc->tc_scale_micro = scale / 1000; + tc->tc_scale_nano_f = scale & 0xffffffff; + tc->tc_scale_nano_i = scale >> 32; +} + +void +init_timecounter(struct 
timecounter *tc) +{ + struct timespec ts1; + struct timecounter *t1, *t2, *t3; + int i; + + tc->tc_adjustment = 0; + tco_setscales(tc); + tc->tc_offset_count = tc->tc_get_timecount(tc); + tc->tc_tweak = tc; + MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK); + *t1 = *tc; + t2 = t1; + for (i = 1; i < NTIMECOUNTER; i++) { + MALLOC(t3, struct timecounter *, sizeof *t3, + M_TIMECOUNTER, M_WAITOK); + *t3 = *tc; + t3->tc_other = t2; + t2 = t3; + } + t1->tc_other = t3; + tc = t1; + + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + + /* XXX: For now always start using the counter. */ + tc->tc_offset_count = tc->tc_get_timecount(tc); + nanouptime(&ts1); + tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32; + tc->tc_offset_micro = ts1.tv_nsec / 1000; + tc->tc_offset_sec = ts1.tv_sec; + timecounter = tc; +} + +void +set_timecounter(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + /* fiddle all the little crinkly bits around the fiords... */ + tco_forward(1); +} + + +#if 0 /* Currently unused */ +void +switch_timecounter(struct timecounter *newtc) +{ + int s; + struct timecounter *tc; + struct timespec ts; + + s = splclock(); + tc = timecounter; + if (newtc == tc || newtc == tc->tc_other) { + splx(s); + return; + } + nanouptime(&ts); + newtc->tc_offset_sec = ts.tv_sec; + newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32; + newtc->tc_offset_micro = ts.tv_nsec / 1000; + newtc->tc_offset_count = newtc->tc_get_timecount(newtc); + timecounter = newtc; + splx(s); +} +#endif + +static struct timecounter * +sync_other_counter(void) +{ + struct timecounter *tc, *tcn, *tco; + unsigned delta; + + tco = timecounter; + tc = tco->tc_other; + tcn = tc->tc_other; + *tc = *tco; + tc->tc_other = tcn; + delta = tco_delta(tc); + tc->tc_offset_count += delta; + tc->tc_offset_count &= tc->tc_counter_mask; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32; + return (tc); +} + +static void +tco_forward(int force) +{ + struct timecounter *tc, *tco; + + tco = timecounter; + tc = sync_other_counter(); + /* + * We may be inducing a tiny error here, the tc_poll_pps() may + * process a latched count which happens after the tco_delta() + * in sync_other_counter(), which would extend the previous + * counters parameters into the domain of this new one. + * Since the timewindow is very small for this, the error is + * going to be only a few weenieseconds (as Dave Mills would + * say), so lets just not talk more about it, OK ? 
+ */ + if (tco->tc_poll_pps) + tco->tc_poll_pps(tco); + if (timedelta != 0) { + tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32; + timedelta -= tickdelta; + force++; + } + + while (tc->tc_offset_nano >= 1000000000ULL << 32) { + tc->tc_offset_nano -= 1000000000ULL << 32; + tc->tc_offset_sec++; + tc->tc_frequency = tc->tc_tweak->tc_frequency; + tc->tc_adjustment = tc->tc_tweak->tc_adjustment; + ntp_update_second(tc); /* XXX only needed if xntpd runs */ + tco_setscales(tc); + force++; + } + + if (tco_method && !force) + return; + + tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32; + + /* Figure out the wall-clock time */ + tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec; + tc->tc_nanotime.tv_nsec = + (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000; + tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec; + if (tc->tc_nanotime.tv_nsec >= 1000000000) { + tc->tc_nanotime.tv_nsec -= 1000000000; + tc->tc_microtime.tv_usec -= 1000000; + tc->tc_nanotime.tv_sec++; + } + time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec; + + timecounter = tc; +} + +static int +sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_frequency, + sizeof(timecounter->tc_tweak->tc_frequency), req)); +} + +static int +sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_adjustment, + sizeof(timecounter->tc_tweak->tc_adjustment), req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0, + "This variable determines the method used for updating timecounters. " + "If the default algorithm (0) fails with \"calcru negative...\" messages " + "try the alternate algorithm (1) which handles bad hardware better." + +); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", ""); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", ""); diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c new file mode 100644 index 0000000..57e8d96 --- /dev/null +++ b/sys/kern/kern_threads.c @@ -0,0 +1,154 @@ +/* + * + * Portions of this code was derived from the file kern_fork.c and as such + * is subject to the copyrights below. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1996 Douglas Santry + * + * This code is subject to the beer copyright. If I chance to meet you in a + * bar and this code helped you in some way, you owe me a beer. Only + * in Germany will I accept domestic beer. This code may or may not work + * and I certainly make no claims as to its fitness for *any* purpose. + * + * $Id: kern_threads.c,v 1.9 1998/10/25 17:44:51 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysproto.h> + +/* + * Low level support for sleep/wakeup paradigm + * If a timeout is specified: + * returns 0 if wakeup + * returns EAGAIN if timed out + * returns EINVAL if error + * + * If a timeout is not specified: + * + * returns time waiting in ticks. 
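+ *
+ * Illustrative usage (callers are hypothetical): one thread calls
+ * thr_sleep() with a timeout while another thread sharing the same
+ * p_leader later calls thr_wakeup() with the sleeper's pid; if the wakeup
+ * arrives first, p_wakeup is already non-zero and thr_sleep() returns
+ * without blocking.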
+ */ +int +thr_sleep(struct proc *p, struct thr_sleep_args *uap) { + int sleepstart; + struct timespec ts; + struct timeval atv; + int error, timo; + + timo = 0; + if (uap->timeout != 0) { + /* + * Get timespec struct + */ + if (error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) { + p->p_wakeup = 0; + return error; + } + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) { + p->p_wakeup = 0; + return (EINVAL); + } + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) { + p->p_wakeup = 0; + return (EINVAL); + } + timo = tvtohz(&atv); + } + + p->p_retval[0] = 0; + if (p->p_wakeup == 0) { + sleepstart = ticks; + p->p_flag |= P_SINTR; + error = tsleep(p, PRIBIO, "thrslp", timo); + p->p_flag &= ~P_SINTR; + if (error == EWOULDBLOCK) { + p->p_wakeup = 0; + p->p_retval[0] = EAGAIN; + return 0; + } + if (uap->timeout == 0) + p->p_retval[0] = ticks - sleepstart; + } + p->p_wakeup = 0; + return (0); +} + +int +thr_wakeup(struct proc *p, struct thr_wakeup_args *uap) { + struct proc *pSlave = p->p_leader; + + while(pSlave && (pSlave->p_pid != uap->pid)) + pSlave = pSlave->p_peers; + + if(pSlave == 0) { + p->p_retval[0] = ESRCH; + return(0); + } + + pSlave->p_wakeup++; + if((pSlave->p_stat == SSLEEP) && (pSlave->p_wchan == pSlave)) { + wakeup(pSlave); + return(0); + } + + p->p_retval[0] = EAGAIN; + return 0; +} + +/* + * General purpose yield system call + */ +int +yield(struct proc *p, struct yield_args *uap) { + int s; + + p->p_retval[0] = 0; + + s = splhigh(); + p->p_priority = MAXPRI; + setrunqueue(p); + mi_switch(); + splx(s); + + return(0); +} + diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 0000000..2bd17bb --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,644 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $Id: kern_time.c,v 1.58 1998/06/09 13:10:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +struct timezone tz; + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. + */ + +static int nanosleep1 __P((struct proc *p, struct timespec *rqt, + struct timespec *rmt)); +static int settime __P((struct timeval *)); +static void timevalfix __P((struct timeval *)); +static void no_lease_updatetime __P((int)); + +static void +no_lease_updatetime(deltat) + int deltat; +{ +} + +void (*lease_updatetime) __P((int)) = no_lease_updatetime; + +static int +settime(tv) + struct timeval *tv; +{ + struct timeval delta, tv1; + struct timespec ts; + int s; + + s = splclock(); + microtime(&tv1); + delta = *tv; + timevalsub(&delta, &tv1); + + /* + * If the system is secure, we do not allow the time to be + * set to an earlier value (it may be slowed using adjtime, + * but not set back). This feature prevent interlopers from + * setting arbitrary time stamps on files. 
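+ *
+ * Concretely: at securelevel > 1 a settimeofday() or clock_settime() that
+ * would step the clock backwards fails with EPERM, while adjtime() (below)
+ * can still slew the clock gradually.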
+ */ + if (delta.tv_sec < 0 && securelevel > 1) { + splx(s); + return (EPERM); + } + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = tv->tv_usec * 1000; + set_timecounter(&ts); + (void) splsoftclock(); + lease_updatetime(delta.tv_sec); + splx(s); + resettodr(); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_gettime_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +/* ARGSUSED */ +int +clock_gettime(p, uap) + struct proc *p; + struct clock_gettime_args *uap; +{ + struct timespec ats; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + nanotime(&ats); + return (copyout(&ats, SCARG(uap, tp), sizeof(ats))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_settime_args { + clockid_t clock_id; + const struct timespec *tp; +}; +#endif + +/* ARGSUSED */ +int +clock_settime(p, uap) + struct proc *p; + struct clock_settime_args *uap; +{ + struct timeval atv; + struct timespec ats; + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) + return (error); + if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000) + return (EINVAL); + /* XXX Don't convert nsec->usec and back */ + TIMESPEC_TO_TIMEVAL(&atv, &ats); + if ((error = settime(&atv))) + return (error); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_getres_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +int +clock_getres(p, uap) + struct proc *p; + struct clock_getres_args *uap; +{ + struct timespec ts; + int error; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + error = 0; + if (SCARG(uap, tp)) { + ts.tv_sec = 0; + ts.tv_nsec = 1000000000 / timecounter->tc_frequency; + error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); + } + return (error); +} + +static int nanowait; + +static int +nanosleep1(p, rqt, rmt) + struct proc *p; + struct timespec *rqt, *rmt; +{ + struct timespec ts, ts2, ts3; + struct timeval tv; + int error; + + if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) + return (EINVAL); + if (rqt->tv_sec < 0 || rqt->tv_sec == 0 && rqt->tv_nsec == 0) + return (0); + getnanouptime(&ts); + timespecadd(&ts, rqt); + TIMESPEC_TO_TIMEVAL(&tv, rqt); + for (;;) { + error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", + tvtohz(&tv)); + getnanouptime(&ts2); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + timespecsub(&ts, &ts2); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; + } + return (error); + } + if (timespeccmp(&ts2, &ts, >=)) + return (0); + ts3 = ts; + timespecsub(&ts3, &ts2); + TIMESPEC_TO_TIMEVAL(&tv, &ts3); + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct nanosleep_args { + struct timespec *rqtp; + struct timespec *rmtp; +}; +#endif + +/* ARGSUSED */ +int +nanosleep(p, uap) + struct proc *p; + struct nanosleep_args *uap; +{ + struct timespec rmt, rqt; + int error, error2; + + error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt)); + if (error) + return (error); + if (SCARG(uap, rmtp)) + if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt), B_WRITE)) + return (EFAULT); + error = nanosleep1(p, &rqt, &rmt); + if (error && SCARG(uap, rmtp)) { + error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); + if (error2) /* XXX shouldn't happen, did useracc() above */ + return (error2); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif +/* ARGSUSED */ +int +gettimeofday(p, uap) + 
struct proc *p; + register struct gettimeofday_args *uap; +{ + struct timeval atv; + int error = 0; + + if (uap->tp) { + microtime(&atv); + if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp, + sizeof (atv)))) + return (error); + } + if (uap->tzp) + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, + sizeof (tz)); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif +/* ARGSUSED */ +int +settimeofday(p, uap) + struct proc *p; + struct settimeofday_args *uap; +{ + struct timeval atv; + struct timezone atz; + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + /* Verify all parameters before changing time. */ + if (uap->tv) { + if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof(atv)))) + return (error); + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + } + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) + return (error); + if (uap->tv && (error = settime(&atv))) + return (error); + if (uap->tzp) + tz = atz; + return (0); +} + +int tickdelta; /* current clock skew, us. per tick */ +long timedelta; /* unapplied time correction, us. */ +static long bigadj = 1000000; /* use 10x skew above bigadj us. */ + +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif +/* ARGSUSED */ +int +adjtime(p, uap) + struct proc *p; + register struct adjtime_args *uap; +{ + struct timeval atv; + register long ndelta, ntickdelta, odelta; + int s, error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((error = + copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval)))) + return (error); + + /* + * Compute the total correction and the rate at which to apply it. + * Round the adjustment down to a whole multiple of the per-tick + * delta, so that after some number of incremental changes in + * hardclock(), tickdelta will become zero, lest the correction + * overshoot and start taking us away from the desired final time. + */ + ndelta = atv.tv_sec * 1000000 + atv.tv_usec; + if (ndelta > bigadj || ndelta < -bigadj) + ntickdelta = 10 * tickadj; + else + ntickdelta = tickadj; + if (ndelta % ntickdelta) + ndelta = ndelta / ntickdelta * ntickdelta; + + /* + * To make hardclock()'s job easier, make the per-tick delta negative + * if we want time to run slower; then hardclock can simply compute + * tick + tickdelta, and subtract tickdelta from timedelta. + */ + if (ndelta < 0) + ntickdelta = -ntickdelta; + s = splclock(); + odelta = timedelta; + timedelta = ndelta; + tickdelta = ntickdelta; + splx(s); + + if (uap->olddelta) { + atv.tv_sec = odelta / 1000000; + atv.tv_usec = odelta % 1000000; + (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, + sizeof(struct timeval)); + } + return (0); +} + +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. + * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. + * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. 
The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif +/* ARGSUSED */ +int +getitimer(p, uap) + struct proc *p; + register struct getitimer_args *uap; +{ + struct timeval ctv; + struct itimerval aitv; + int s; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + s = splclock(); /* XXX still needed ? */ + if (uap->which == ITIMER_REAL) { + /* + * Convert from absolute to relative time in .it_value + * part of real time timer. If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. + */ + aitv = p->p_realtimer; + if (timevalisset(&aitv.it_value)) { + getmicrouptime(&ctv); + if (timevalcmp(&aitv.it_value, &ctv, <)) + timevalclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, &ctv); + } + } else + aitv = p->p_stats->p_timer[uap->which]; + splx(s); + return (copyout((caddr_t)&aitv, (caddr_t)uap->itv, + sizeof (struct itimerval))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif +/* ARGSUSED */ +int +setitimer(p, uap) + struct proc *p; + register struct setitimer_args *uap; +{ + struct itimerval aitv; + struct timeval ctv; + register struct itimerval *itvp; + int s, error; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + itvp = uap->itv; + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + if ((uap->itv = uap->oitv) && + (error = getitimer(p, (struct getitimer_args *)uap))) + return (error); + if (itvp == 0) + return (0); + if (itimerfix(&aitv.it_value)) + return (EINVAL); + if (!timevalisset(&aitv.it_value)) + timevalclear(&aitv.it_interval); + else if (itimerfix(&aitv.it_interval)) + return (EINVAL); + s = splclock(); /* XXX: still needed ? */ + if (uap->which == ITIMER_REAL) { + if (timevalisset(&p->p_realtimer.it_value)) + untimeout(realitexpire, (caddr_t)p, p->p_ithandle); + if (timevalisset(&aitv.it_value)) + p->p_ithandle = timeout(realitexpire, (caddr_t)p, + tvtohz(&aitv.it_value)); + getmicrouptime(&ctv); + timevaladd(&aitv.it_value, &ctv); + p->p_realtimer = aitv; + } else + p->p_stats->p_timer[uap->which] = aitv; + splx(s); + return (0); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. + * tvtohz() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. 
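+ *
+ * Worked example: with an it_interval of 10 ticks, a timeout that fires 35
+ * ticks late advances it_value by it_interval four times before it again
+ * exceeds the current uptime, so the missed expirations collapse into the
+ * single SIGALRM realitexpire() sends; the "tvtohz(&ntv) - 1" below strips
+ * the extra tick tvtohz() adds.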
+ */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + struct timeval ctv, ntv; + int s; + + p = (struct proc *)arg; + psignal(p, SIGALRM); + if (!timevalisset(&p->p_realtimer.it_interval)) { + timevalclear(&p->p_realtimer.it_value); + return; + } + for (;;) { + s = splclock(); /* XXX: still neeeded ? */ + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + getmicrouptime(&ctv); + if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { + ntv = p->p_realtimer.it_value; + timevalsub(&ntv, &ctv); + p->p_ithandle = timeout(realitexpire, (caddr_t)p, + tvtohz(&ntv) - 1); + splx(s); + return; + } + splx(s); + } +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +int +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. + */ +int +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timevalisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timevalisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. + */ +void +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +void +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..278fcce --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_timeout.c,v 1.54 1998/02/25 06:13:32 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* Exported to machdep.c and/or kern_clock.c. */ +struct callout *callout; +struct callout_list callfree; +int callwheelsize, callwheelbits, callwheelmask; +struct callout_tailq *callwheel; +int softticks; /* Like ticks, but for softclock(). */ + +static struct callout *nextsoftcheck; /* Next callout to be checked. */ + +/* + * The callout mechanism is based on the work of Adam M. Costello and + * George Varghese, published in a technical report entitled "Redesigning + * the BSD Callout and Timer Facilities" and modified slightly for inclusion + * in FreeBSD by Justin T. Gibbs. The original work on the data structures + * used in this implementation was published by G.Varghese and A. Lauck in + * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for + * the Efficient Implementation of a Timer Facility" in the Proceedings of + * the 11th ACM Annual Symposium on Operating Systems Principles, + * Austin, Texas Nov 1987. + */ + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +void +softclock() +{ + register struct callout *c; + register struct callout_tailq *bucket; + register int s; + register int curticks; + register int steps; /* #steps since we last allowed interrupts */ + +#ifndef MAX_SOFTCLOCK_STEPS +#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. 
*/ +#endif /* MAX_SOFTCLOCK_STEPS */ + + steps = 0; + s = splhigh(); + while (softticks != ticks) { + softticks++; + /* + * softticks may be modified by hard clock, so cache + * it while we work on a given bucket. + */ + curticks = softticks; + bucket = &callwheel[curticks & callwheelmask]; + c = TAILQ_FIRST(bucket); + while (c) { + if (c->c_time != curticks) { + c = TAILQ_NEXT(c, c_links.tqe); + ++steps; + if (steps >= MAX_SOFTCLOCK_STEPS) { + nextsoftcheck = c; + /* Give interrupts a chance. */ + splx(s); + s = splhigh(); + c = nextsoftcheck; + steps = 0; + } + } else { + void (*c_func)(void *); + void *c_arg; + + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(bucket, c, c_links.tqe); + c_func = c->c_func; + c_arg = c->c_arg; + c->c_func = NULL; + SLIST_INSERT_HEAD(&callfree, c, c_links.sle); + splx(s); + c_func(c_arg); + s = splhigh(); + steps = 0; + c = nextsoftcheck; + } + } + } + nextsoftcheck = NULL; + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * callout_handle_init -- + * Initialize a handle so that using it with untimeout is benign. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that although an + * identification value is returned from timeout, the original + * arguments to timeout as well as the identifier are used to + * identify entries for untimeout. + */ +struct callout_handle +timeout(ftn, arg, to_ticks) + timeout_t *ftn; + void *arg; + register int to_ticks; +{ + int s; + struct callout *new; + struct callout_handle handle; + + if (to_ticks <= 0) + to_ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + new = SLIST_FIRST(&callfree); + if (new == NULL) + /* XXX Attempt to malloc first */ + panic("timeout table full"); + + SLIST_REMOVE_HEAD(&callfree, c_links.sle); + new->c_arg = arg; + new->c_func = ftn; + new->c_time = ticks + to_ticks; + TAILQ_INSERT_TAIL(&callwheel[new->c_time & callwheelmask], + new, c_links.tqe); + + splx(s); + handle.callout = new; + return (handle); +} + +void +untimeout(ftn, arg, handle) + timeout_t *ftn; + void *arg; + struct callout_handle handle; +{ + register int s; + + /* + * Check for a handle that was initialized + * by callout_handle_init, but never used + * for a real timeout. + */ + if (handle.callout == NULL) + return; + + s = splhigh(); + if ((handle.callout->c_func == ftn) + && (handle.callout->c_arg == arg)) { + if (nextsoftcheck == handle.callout) { + nextsoftcheck = TAILQ_NEXT(handle.callout, c_links.tqe); + } + TAILQ_REMOVE(&callwheel[handle.callout->c_time & callwheelmask], + handle.callout, c_links.tqe); + handle.callout->c_func = NULL; + SLIST_INSERT_HEAD(&callfree, handle.callout, c_links.sle); + } + splx(s); +} + +void +callout_handle_init(struct callout_handle *handle) +{ + handle->callout = NULL; +} + +#ifdef APM_FIXUP_CALLTODO +/* + * Adjust the kernel calltodo timeout list. This routine is used after + * an APM resume to recalculate the calltodo timer list values with the + * number of hz's we have been sleeping. The next hardclock() will detect + * that there are fired timers and run softclock() to execute them. + * + * Please note, I have not done an exhaustive analysis of what code this + * might break. 
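A small self-contained illustration of the callwheel indexing used by timeout() and softclock() above (a sketch; WHEEL_SIZE and the sample delays are made up): the bucket is a power-of-two hash of the absolute expiry tick, so several future expiry times share a bucket, which is why softclock() skips entries whose c_time does not match the tick currently being swept.

#include <stdio.h>

#define WHEEL_SIZE	8			/* must be a power of two */
#define WHEEL_MASK	(WHEEL_SIZE - 1)

int
main(void)
{
	int ticks = 100;			/* pretend current tick count */
	int delays[] = { 1, 8, 9, 17 };		/* requested timeouts in ticks */
	int i;

	for (i = 0; i < 4; i++) {
		int c_time = ticks + delays[i];	/* absolute expiry tick */
		printf("delay %2d -> bucket %d, fires when softticks == %d\n",
		    delays[i], c_time & WHEEL_MASK, c_time);
	}
	/*
	 * Delays 1, 9 and 17 all land in bucket 5 because their absolute
	 * expiry ticks differ by multiples of WHEEL_SIZE; only the entry
	 * whose c_time equals the tick being swept is removed and run, the
	 * others wait for a later revolution of the wheel.
	 */
	return 0;
}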
I am motivated to have my select()'s and alarm()'s that + * have expired during suspend firing upon resume so that the applications + * which set the timer can do the maintanence the timer was for as close + * as possible to the originally intended time. Testing this code for a + * week showed that resuming from a suspend resulted in 22 to 25 timers + * firing, which seemed independant on whether the suspend was 2 hours or + * 2 days. Your milage may vary. - Ken Key <key@cs.utk.edu> + */ +void +adjust_timeout_calltodo(time_change) + struct timeval *time_change; +{ + register struct callout *p; + unsigned long delta_ticks; + int s; + + /* + * How many ticks were we asleep? + * (stolen from tvtohz()). + */ + + /* Don't do anything */ + if (time_change->tv_sec < 0) + return; + else if (time_change->tv_sec <= LONG_MAX / 1000000) + delta_ticks = (time_change->tv_sec * 1000000 + + time_change->tv_usec + (tick - 1)) / tick + 1; + else if (time_change->tv_sec <= LONG_MAX / hz) + delta_ticks = time_change->tv_sec * hz + + (time_change->tv_usec + (tick - 1)) / tick + 1; + else + delta_ticks = LONG_MAX; + + if (delta_ticks > INT_MAX) + delta_ticks = INT_MAX; + + /* + * Now rip through the timer calltodo list looking for timers + * to expire. + */ + + /* don't collide with softclock() */ + s = splhigh(); + for (p = calltodo.c_next; p != NULL; p = p->c_next) { + p->c_time -= delta_ticks; + + /* Break if the timer had more time on it than delta_ticks */ + if (p->c_time > 0) + break; + + /* take back the ticks the timer didn't use (p->c_time <= 0) */ + delta_ticks = -p->c_time; + } + splx(s); + + return; +} +#endif /* APM_FIXUP_CALLTODO */ diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 0000000..b7cb83b --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $Id: kern_xxx.c,v 1.27 1997/12/16 17:40:21 eivind Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/utsname.h> + + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif +/* ARGSUSED */ +int +ogethostname(p, uap) + struct proc *p; + struct gethostname_args *uap; +{ + int name[2]; + size_t len = uap->len; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + return (userland_sysctl(p, name, 2, uap->hostname, &len, + 1, 0, 0, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif +/* ARGSUSED */ +int +osethostname(p, uap) + struct proc *p; + register struct sethostname_args *uap; +{ + int name[2]; + int error; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + return (userland_sysctl(p, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +ogethostid(p, uap) + struct proc *p; + struct ogethostid_args *uap; +{ + + *(long *)(p->p_retval) = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif +/* ARGSUSED */ +int +osethostid(p, uap) + struct proc *p; + struct osethostid_args *uap; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + hostid = uap->hostid; + return (0); +} + +int +oquota(p, uap) + struct proc *p; + struct oquota_args *uap; +{ + + return (ENOSYS); +} +#endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* ARGSUSED */ +int +uname(p, uap) + struct proc *p; + struct uname_args *uap; +{ + int name[2], rtval; + size_t len; + char *s, *us; + + name[0] = CTL_KERN; + name[1] = KERN_OSTYPE; + len = sizeof uap->name->sysname; + rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + rtval = userland_sysctl(p, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + rtval = userland_sysctl(p, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->version + 
sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + rtval = subyte( us++, *s); + if( rtval) + return rtval; + } + rtval = subyte( us++, 0); + if( rtval) + return rtval; + + name[0] = CTL_HW; + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + rtval = userland_sysctl(p, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); + + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +getdomainname(p, uap) + struct proc *p; + struct getdomainname_args *uap; +{ + int domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +setdomainname(p, uap) + struct proc *p; + struct setdomainname_args *uap; +{ + int error, domainnamelen; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((u_int)uap->len > sizeof (domainname) - 1) + return EINVAL; + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; + return (error); +} + diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c new file mode 100644 index 0000000..3718e253 --- /dev/null +++ b/sys/kern/ksched.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 1996, 1997 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* ksched: Soft real time scheduling based on "rtprio". 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/resource.h> +#include <machine/cpu.h> /* For need_resched */ + +#include <posix4/posix4.h> + +/* ksched: Real-time extension to support POSIX priority scheduling. + */ + +struct ksched { + struct timespec rr_interval; +}; + +int ksched_attach(struct ksched **p) +{ + struct ksched *ksched= p31b_malloc(sizeof(*ksched)); + + ksched->rr_interval.tv_sec = 0; + ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); + + *p = ksched; + return 0; +} + +int ksched_detach(struct ksched *p) +{ + p31b_free(p); + + return 0; +} + +/* + * XXX About priorities + * + * POSIX 1003.1b requires that numerically higher priorities be of + * higher priority. It also permits sched_setparam to be + * implementation defined for SCHED_OTHER. I don't like + * the notion of inverted priorites for normal processes when + * you can use "setpriority" for that. + * + * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. + */ + +/* Macros to convert between the unix (lower numerically is higher priority) + * and POSIX 1003.1b (higher numerically is higher priority) + */ + +#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) +#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) + +/* These improve readability a bit for me: + */ +#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) +#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) + +static __inline int +getscheduler(int *ret, struct ksched *ksched, struct proc *p) +{ + int e = 0; + + switch (p->p_rtprio.type) + { + case RTP_PRIO_FIFO: + *ret = SCHED_FIFO; + break; + + case RTP_PRIO_REALTIME: + *ret = SCHED_RR; + break; + + default: + *ret = SCHED_OTHER; + break; + } + + return e; +} + +int ksched_setparam(int *ret, struct ksched *ksched, + struct proc *p, const struct sched_param *param) +{ + int e, policy; + + e = getscheduler(&policy, ksched, p); + + if (e == 0) + { + if (policy == SCHED_OTHER) + e = EINVAL; + else + e = ksched_setscheduler(ret, ksched, p, policy, param); + } + + return e; +} + +int ksched_getparam(int *ret, struct ksched *ksched, + struct proc *p, struct sched_param *param) +{ + if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) + param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + + return 0; +} + +/* + * XXX The priority and scheduler modifications should + * be moved into published interfaces in kern/kern_sync. + * + * The permissions to modify process p were checked in "p31b_proc()". + * + */ +int ksched_setscheduler(int *ret, struct ksched *ksched, + struct proc *p, int policy, const struct sched_param *param) +{ + int e = 0; + struct rtprio rtp; + + switch(policy) + { + case SCHED_RR: + case SCHED_FIFO: + + if (param->sched_priority >= P1B_PRIO_MIN && + param->sched_priority <= P1B_PRIO_MAX) + { + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + rtp.type = (policy == SCHED_FIFO) + ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; + + p->p_rtprio = rtp; + need_resched(); + } + else + e = EPERM; + + + break; + + case SCHED_OTHER: + { + rtp.type = RTP_PRIO_NORMAL; + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + p->p_rtprio = rtp; + + /* XXX Simply revert to whatever we had for last + * normal scheduler priorities. + * This puts a requirement + * on the scheduling code: You must leave the + * scheduling info alone. 
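The p4prio_to_rtpprio()/rtpprio_to_p4prio() macros above are a single reflection about RTP_PRIO_MAX, so applying the mapping twice returns the original value and numerically higher POSIX priorities become numerically lower (stronger) rtprio values. A quick standalone check (a sketch; 31 is only an illustrative stand-in for RTP_PRIO_MAX):

#include <assert.h>

#define PRIO_MAX_ILLUST	31			/* stand-in for RTP_PRIO_MAX */
#define P4_TO_RTP(P)	(PRIO_MAX_ILLUST - (P))
#define RTP_TO_P4(P)	(PRIO_MAX_ILLUST - (P))

int
main(void)
{
	int p;

	for (p = 0; p <= PRIO_MAX_ILLUST; p++) {
		/* mapping twice gets the original POSIX priority back */
		assert(RTP_TO_P4(P4_TO_RTP(p)) == p);
		/* higher POSIX priority -> numerically lower rtprio value */
		assert(p == 0 || P4_TO_RTP(p) < P4_TO_RTP(p - 1));
	}
	return 0;
}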
+ */ + need_resched(); + } + break; + } + + return e; +} + +int ksched_getscheduler(int *ret, struct ksched *ksched, struct proc *p) +{ + return getscheduler(ret, ksched, p); +} + +/* ksched_yield: Yield the CPU. + */ +int ksched_yield(int *ret, struct ksched *ksched) +{ + need_resched(); + return 0; +} + +int ksched_get_priority_max(int *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = RTP_PRIO_MAX; + break; + + case SCHED_OTHER: + *ret = PRIO_MAX; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_get_priority_min(int *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = P1B_PRIO_MIN; + break; + + case SCHED_OTHER: + *ret = PRIO_MIN; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_rr_get_interval(int *ret, struct ksched *ksched, + struct proc *p, struct timespec *timespec) +{ + *timespec = ksched->rr_interval; + + return 0; +} diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c new file mode 100644 index 0000000..29b5884 --- /dev/null +++ b/sys/kern/link_aout.c @@ -0,0 +1,585 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: link_aout.c,v 1.16 1998/11/03 14:25:21 peter Exp $ + */ + +#ifndef __alpha__ + +#define FREEBSD_AOUT 1 + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <a.out.h> +#include <link.h> + +static int link_aout_load_module(const char*, linker_file_t*); + +static int link_aout_load_file(const char*, linker_file_t*); + +static int link_aout_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_aout_symbol_values(linker_file_t file, linker_sym_t sym, + linker_symval_t* symval); +static int link_aout_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp); +static void link_aout_unload_file(linker_file_t); +static void link_aout_unload_module(linker_file_t); + +static struct linker_class_ops link_aout_class_ops = { + link_aout_load_module, +}; + +static struct linker_file_ops link_aout_file_ops = { + link_aout_lookup_symbol, + link_aout_symbol_values, + link_aout_search_symbol, + link_aout_unload_file, +}; +static struct linker_file_ops link_aout_module_ops = { + link_aout_lookup_symbol, + link_aout_symbol_values, + link_aout_search_symbol, + link_aout_unload_module, +}; + +typedef struct aout_file { + char* address; /* Load address */ + struct _dynamic* dynamic; /* Symbol table etc. */ +} *aout_file_t; + +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_aout_init(void* arg) +{ +#ifndef __ELF__ + struct _dynamic* dp = &_DYNAMIC; +#endif + + linker_add_class("a.out", NULL, &link_aout_class_ops); + +#ifndef __ELF__ + if (dp) { + aout_file_t af; + + af = malloc(sizeof(struct aout_file), M_LINKER, M_NOWAIT); + if (af == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + bzero(af, sizeof(*af)); + + af->address = 0; + af->dynamic = dp; + linker_kernel_file = + linker_make_file(kernelname, af, &link_aout_file_ops); + if (linker_kernel_file == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + /* + * XXX there must be a better way of getting these constants. + */ + linker_kernel_file->address = (caddr_t) 0xf0100000; + linker_kernel_file->size = -0xf0100000; + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0); + +static int +link_aout_load_module(const char* filename, linker_file_t* result) +{ + caddr_t modptr, baseptr; + char *type; + struct exec *ehdr; + aout_file_t af; + linker_file_t lf; + int error; + + /* Look to see if we have the module preloaded. */ + if ((modptr = preload_search_by_name(filename)) == NULL) + return(link_aout_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information. */ + if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) || + strcmp(type, "a.out module") || + ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) || + ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL)) + return(0); /* we can't handle this */ + + /* Looks like we can handle this one */ + af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK); + bzero(af, sizeof(*af)); + af->address = baseptr; + + /* Assume _DYNAMIC is the first data item. 
*/ + af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + free(af, M_LINKER); + return(0); /* we can't handle this */ + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + /* Register with kld */ + lf = linker_make_file(filename, af, &link_aout_module_ops); + if (lf == NULL) { + free(af, M_LINKER); + return(ENOMEM); + } + lf->address = af->address; + lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss; + + /* Try to load dependancies */ + if (((error = load_dependancies(lf)) != 0) || + ((error = relocate_file(lf)) != 0)) { + linker_file_unload(lf); + return(error); + } + *result = lf; + return(0); +} + +static int +link_aout_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + int error = 0; + int resid; + struct exec header; + aout_file_t af; + linker_file_t lf; + char *pathname; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the a.out header from the file. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC)) + goto out; + + /* + * We have an a.out file, so make some space to read it in. + */ + af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK); + bzero(af, sizeof(*af)); + af->address = malloc(header.a_text + header.a_data + header.a_bss, + M_LINKER, M_WAITOK); + + /* + * Read the text and data sections and zero the bss. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address, + header.a_text + header.a_data, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + bzero(af->address + header.a_text + header.a_data, header.a_bss); + + /* + * Assume _DYNAMIC is the first data item. 
+ */ + af->dynamic = (struct _dynamic*) (af->address + header.a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + free(af->address, M_LINKER); + free(af, M_LINKER); + goto out; + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf = linker_make_file(filename, af, &link_aout_file_ops); + if (lf == NULL) { + free(af->address, M_LINKER); + free(af, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = af->address; + lf->size = header.a_text + header.a_data + header.a_bss; + + if ((error = load_dependancies(lf)) != 0 + || (error = relocate_file(lf)) != 0) { + linker_file_unload(lf); + goto out; + } + + *result = lf; + +out: + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_aout_unload_file(linker_file_t file) +{ + aout_file_t af = file->priv; + + if (af) { + if (af->address) + free(af->address, M_LINKER); + free(af, M_LINKER); + } +} + +static void +link_aout_unload_module(linker_file_t file) +{ + aout_file_t af = file->priv; + + if (af) + free(af, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off)) + +static int +load_dependancies(linker_file_t lf) +{ + aout_file_t af = lf->priv; + linker_file_t lfdep; + long off; + struct sod* sodp; + char* name; + char* filename = 0; + int error = 0; + + /* + * All files are dependant on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + off = LD_NEED(af->dynamic); + + /* + * Load the dependancies. + */ + while (off != 0) { + sodp = AOUT_RELOC(af, struct sod, off); + name = AOUT_RELOC(af, char, sodp->sod_name); + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + off = sodp->sod_next; + } + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +/* + * XXX i386 dependant. 
+ */ +static long +read_relocation(struct relocation_info* r, char* addr) +{ + int length = r->r_length; + if (length == 0) + return *(u_char*) addr; + else if (length == 1) + return *(u_short*) addr; + else if (length == 2) + return *(u_int*) addr; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); + return 0; +} + +static void +write_relocation(struct relocation_info* r, char* addr, long value) +{ + int length = r->r_length; + if (length == 0) + *(u_char*) addr = value; + else if (length == 1) + *(u_short*) addr = value; + else if (length == 2) + *(u_int*) addr = value; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); +} + +static int +relocate_file(linker_file_t lf) +{ + aout_file_t af = lf->priv; + struct relocation_info* rel; + struct relocation_info* erel; + struct relocation_info* r; + struct nzlist* symbolbase; + char* stringbase; + struct nzlist* np; + char* sym; + long relocation; + + rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic)); + erel = AOUT_RELOC(af, struct relocation_info, + LD_REL(af->dynamic) + LD_RELSZ(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + for (r = rel; r < erel; r++) { + char* addr; + + if (r->r_address == 0) + break; + + addr = AOUT_RELOC(af, char, r->r_address); + if (r->r_extern) { + np = &symbolbase[r->r_symbolnum]; + sym = &stringbase[np->nz_strx]; + + if (sym[0] != '_') { + printf("link_aout: bad symbol name %s\n", sym); + relocation = 0; + } else + relocation = (intptr_t) + linker_file_lookup_symbol(lf, sym + 1, + np->nz_type != (N_SETV+N_EXT)); + if (!relocation) { + printf("link_aout: symbol %s not found\n", sym); + return ENOENT; + } + + relocation += read_relocation(r, addr); + + if (r->r_jmptable) { + printf("link_aout: can't cope with jump table relocations\n"); + continue; + } + + if (r->r_pcrel) + relocation -= (intptr_t) af->address; + + if (r->r_copy) { + printf("link_aout: can't cope with copy relocations\n"); + continue; + } + + write_relocation(r, addr, relocation); + } else { + write_relocation(r, addr, + (intptr_t)(read_relocation(r, addr) + af->address)); + } + + } + + return 0; +} + +static long +symbol_hash_value(aout_file_t af, const char* name) +{ + long hashval; + const char* p; + + hashval = '_'; /* fake a starting '_' for C symbols */ + for (p = name; *p; p++) + hashval = (hashval << 1) + *p; + + return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic); +} + +int +link_aout_lookup_symbol(linker_file_t file, const char* name, + linker_sym_t* sym) +{ + aout_file_t af = file->priv; + long hashval; + struct rrs_hash* hashbase; + struct nzlist* symbolbase; + char* stringbase; + struct rrs_hash* hp; + struct nzlist* np; + char* cp; + + if (LD_BUCKETS(af->dynamic) == 0) + return NULL; + + hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + +restart: + hashval = symbol_hash_value(af, name); + hp = &hashbase[hashval]; + if (hp->rh_symbolnum == -1) + return ENOENT; + + while (hp) { + np = (struct nzlist *) &symbolbase[hp->rh_symbolnum]; + cp = stringbase + np->nz_strx; + /* + * Note: we fake the leading '_' for C symbols. + */ + if (cp[0] == '_' && !strcmp(cp + 1, name)) + break; + + if (hp->rh_next == 0) + hp = NULL; + else + hp = &hashbase[hp->rh_next]; + } + + if (hp == NULL) + /* + * Not found. 
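The symbol_hash_value() routine above, restated as a standalone fragment for clarity (a sketch; the bucket count of 151 is illustrative, the real count comes from LD_BUCKETS): C symbols are stored with a leading underscore in a.out, so the hash is seeded with '_' and callers pass the name without it.

#include <stdio.h>

static long
rrs_hash(const char *name, long nbuckets)
{
	long hashval = '_';		/* fake the leading '_' of C symbols */
	const char *p;

	for (p = name; *p; p++)
		hashval = (hashval << 1) + *p;
	return (hashval & 0x7fffffff) % nbuckets;
}

int
main(void)
{
	printf("printf -> bucket %ld of 151\n", rrs_hash("printf", 151));
	printf("malloc -> bucket %ld of 151\n", rrs_hash("malloc", 151));
	return 0;
}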
+ */ + return ENOENT; + + /* + * Check for an aliased symbol, whatever that is. + */ + if (np->nz_type == N_INDR+N_EXT) { + name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */ + goto restart; + } + + /* + * Check this is an actual definition of the symbol. + */ + if (np->nz_value == 0) + return ENOENT; + + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + if (np->nz_other == AUX_FUNC) + /* weak function */ + return ENOENT; + } + + *sym = (linker_sym_t) np; + + return 0; +} + + +static int +link_aout_symbol_values(linker_file_t file, linker_sym_t sym, + linker_symval_t* symval) +{ + aout_file_t af = file->priv; + struct nzlist* np = (struct nzlist*) sym; + char* stringbase; + long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist); + struct nzlist *symbase; + + /* Is it one of ours? It could be another module... */ + symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + if (np < symbase) + return ENOENT; + if ((np - symbase) > numsym) + return ENOENT; + + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */ + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + symval->value = 0; + symval->size = np->nz_value; + } else { + symval->value = AOUT_RELOC(af, char, np->nz_value); + symval->size = np->nz_size; + } + return 0; +} + +static int +link_aout_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + aout_file_t af = lf->priv; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + struct nzlist* sp; + struct nzlist* ep; + struct nzlist* best = 0; + + for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)), + ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic)); + sp < ep; sp++) { + if (sp->nz_name == 0) + continue; + if (off >= sp->nz_value) { + if (off - sp->nz_value < diff) { + diff = off - sp->nz_value; + best = sp; + if (diff == 0) + break; + } else if (off - sp->nz_value == diff) { + best = sp; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} + +#endif /* !__alpha__ */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c new file mode 100644 index 0000000..c5e84da --- /dev/null +++ b/sys/kern/link_elf.c @@ -0,0 +1,981 @@ +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <machine/elf.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int link_elf_load_module(const char*, linker_file_t*); +static int link_elf_load_file(const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_module(linker_file_t); + +static struct linker_class_ops link_elf_class_ops = { + link_elf_load_module, +}; + +static struct linker_file_ops link_elf_file_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_file, +}; + +static struct linker_file_ops link_elf_module_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_module, +}; +typedef struct elf_file { + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + const Elf_Dyn* dynamic; /* Symbol table etc. */ + Elf_Off nbuckets; /* DT_HASH info */ + Elf_Off nchains; + const Elf_Off* buckets; + const Elf_Off* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +} *elf_file_t; + +static int parse_dynamic(linker_file_t lf); +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); +static int parse_module_symbols(linker_file_t lf); + +/* + * The kernel symbol table starts here. 
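For reference, the DT_HASH section that parse_dynamic() below unpacks into nbuckets/nchains/buckets/chains is simply an array of Elf_Off words laid out as [nbuckets][nchains][bucket heads...][one chain link per symbol]. A tiny model of that layout and of walking one chain (a sketch with made-up values, using unsigned int in place of Elf_Off):

#include <stdio.h>

typedef unsigned int Off;		/* stand-in for Elf_Off */

int
main(void)
{
	/* 2 buckets, 5 symbols; bucket heads 1 and 3; chain links end at 0 */
	Off hashtab[] = { 2, 5,  1, 3,  0, 2, 0, 4, 0 };
	Off nbuckets = hashtab[0];
	Off nchains = hashtab[1];
	const Off *buckets = hashtab + 2;
	const Off *chains = buckets + nbuckets;
	Off sym;

	/* walk the chain rooted at bucket 0; index 0 (STN_UNDEF) terminates */
	for (sym = buckets[0]; sym != 0 && sym < nchains; sym = chains[sym])
		printf("bucket 0 -> symbol index %u\n", sym);
	return 0;
}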
+ */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#endif + +#if ELF_TARG_CLASS == ELFCLASS32 + linker_add_class("elf32", NULL, &link_elf_class_ops); +#else + linker_add_class("elf64", NULL, &link_elf_class_ops); +#endif + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + if (dp) { + ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT); + if (ef == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + bzero(ef, sizeof(*ef)); + + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + parse_dynamic(linker_kernel_file); + /* Sigh, magic constants. */ +#ifdef __alpha__ + linker_kernel_file->address = (caddr_t) 0xfffffc0000300000; +#else + linker_kernel_file->address = (caddr_t) 0xf0100000; +#endif + linker_kernel_file->size = -(long)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)parse_module_symbols(linker_kernel_file); + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +parse_module_symbols(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Off *hashtab = (const Elf_Off *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = 
(Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +static int +link_elf_load_module(const char *filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the module preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return (link_elf_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information */ + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + if (ef == NULL) + return (ENOMEM); + bzero(ef, sizeof(*ef)); + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf = linker_make_file(filename, ef, &link_elf_module_ops); + if (lf == NULL) { + free(ef, M_LINKER); + return ENOMEM; + } + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = load_dependancies(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = relocate_file(lf); + if (error) { + linker_file_unload(lf); + return error; + } + (void)parse_module_symbols(lf); + *result = lf; + return (0); +} + +static int +link_elf_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid; + elf_file_t ef; + linker_file_t lf; + char 
*pathname; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; + + shdr = NULL; + lf = NULL; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the elf header from the file. + */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. 
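To make the address arithmetic below concrete (an illustration with hypothetical numbers and a PAGE_SIZE of 4096): for a text segment with p_vaddr 0x1000 and a data segment with p_vaddr 0x5000 and p_memsz 0x1800, base_vaddr = trunc_page(0x1000) = 0x1000, base_vlimit = round_page(0x5000 + 0x1800) = 0x7000, and mapsize = 0x6000, i.e. one contiguous 24 KB region covering text, data and bss, whose start also becomes the base address that every later relocation is made relative to.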
+ */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + bzero(ef, sizeof(*ef)); +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + free(ef, M_LINKER); + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. + */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + + ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf = linker_make_file(filename, ef, &link_elf_file_ops); + if (lf == NULL) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(lf); + if (error) + goto out; + error = load_dependancies(lf); + if (error) + goto out; + error = relocate_file(lf); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) 
*/ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + bzero(shdr, nbytes); + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) { +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); + free(ef, M_LINKER); + } +} + +static void +link_elf_unload_module(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) + free(ef, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +static int +load_dependancies(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + linker_file_t lfdep; + char* name; + const Elf_Dyn *dp; + int error = 0; + + /* + * All files are dependant on /kernel. 
+ */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag == DT_NEEDED) { + name = ef->strtab + dp->d_un.d_val; + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + } + } + +out: + return error; +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
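As an illustrative hand computation (not part of the original source), hashing the name "exit" with the function below proceeds one character at a time:

	'e' (0x65): h = 0x00065
	'x' (0x78): h = 0x006c8
	'i' (0x69): h = 0x06ce9
	't' (0x74): h = 0x6cf04

so elf_hash("exit") is 0x6cf04; the high-nibble folding through g never fires for a name this short. link_elf_lookup_symbol() then starts its search at buckets[0x6cf04 % nbuckets] and follows chains[] until the name matches or STN_UNDEF is reached.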
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym) +{ + elf_file_t ef = lf->priv; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = lf->priv; + Elf_Sym* es = (Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = lf->priv; + u_long off = (u_long) value; + u_long diff = off; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + if (off >= es->st_value) { + if (off - es->st_value < diff) { + diff = off - es->st_value; + best = es; + if (diff == 0) + break; + } else if (off - es->st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c new file mode 100644 index 0000000..c5e84da --- /dev/null +++ b/sys/kern/link_elf_obj.c @@ -0,0 +1,981 @@ +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <machine/elf.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int link_elf_load_module(const char*, linker_file_t*); +static int link_elf_load_file(const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_module(linker_file_t); + +static struct linker_class_ops link_elf_class_ops = { + link_elf_load_module, +}; + +static struct linker_file_ops link_elf_file_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_file, +}; + +static struct linker_file_ops link_elf_module_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_module, +}; +typedef struct elf_file { + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + const Elf_Dyn* dynamic; /* Symbol table etc. 
*/ + Elf_Off nbuckets; /* DT_HASH info */ + Elf_Off nchains; + const Elf_Off* buckets; + const Elf_Off* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +} *elf_file_t; + +static int parse_dynamic(linker_file_t lf); +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); +static int parse_module_symbols(linker_file_t lf); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#endif + +#if ELF_TARG_CLASS == ELFCLASS32 + linker_add_class("elf32", NULL, &link_elf_class_ops); +#else + linker_add_class("elf64", NULL, &link_elf_class_ops); +#endif + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + if (dp) { + ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT); + if (ef == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + bzero(ef, sizeof(*ef)); + + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + parse_dynamic(linker_kernel_file); + /* Sigh, magic constants. 
*/ +#ifdef __alpha__ + linker_kernel_file->address = (caddr_t) 0xfffffc0000300000; +#else + linker_kernel_file->address = (caddr_t) 0xf0100000; +#endif + linker_kernel_file->size = -(long)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)parse_module_symbols(linker_kernel_file); + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +parse_module_symbols(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Off *hashtab = (const Elf_Off *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + 
ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +static int +link_elf_load_module(const char *filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the module preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return (link_elf_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information */ + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + if (ef == NULL) + return (ENOMEM); + bzero(ef, sizeof(*ef)); + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf = linker_make_file(filename, ef, &link_elf_module_ops); + if (lf == NULL) { + free(ef, M_LINKER); + return ENOMEM; + } + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = load_dependancies(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = relocate_file(lf); + if (error) { + linker_file_unload(lf); + return error; + } + (void)parse_module_symbols(lf); + *result = lf; + return (0); +} + +static int +link_elf_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid; + elf_file_t ef; + linker_file_t lf; + char *pathname; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; + + shdr = NULL; + lf = NULL; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the elf header from the file. 
+ */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + bzero(ef, sizeof(*ef)); +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + free(ef, M_LINKER); + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. 
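A hypothetical worked example of the mapping arithmetic above, assuming 4KB pages, a text segment at p_vaddr 0x0 and a data segment with p_vaddr 0x3100, p_filesz 0x0600 and p_memsz 0x0a00:

	base_vaddr  = trunc_page(0x0)             = 0x0
	base_vlimit = round_page(0x3100 + 0x0a00) = 0x4000
	mapsize     = 0x4000 - 0x0                = 0x4000
	segbase     = mapbase + 0x3100 - 0x0      (data segment)
	bzero(segbase + 0x0600, 0x0400)           (the bss tail)

The loop below performs exactly this per segment: copy p_filesz bytes from the file, then clear the remaining p_memsz - p_filesz bytes.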
+ */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + + ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf = linker_make_file(filename, ef, &link_elf_file_ops); + if (lf == NULL) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(lf); + if (error) + goto out; + error = load_dependancies(lf); + if (error) + goto out; + error = relocate_file(lf); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) */ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + bzero(shdr, nbytes); + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) { +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + 
free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); + free(ef, M_LINKER); + } +} + +static void +link_elf_unload_module(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) + free(ef, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +static int +load_dependancies(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + linker_file_t lfdep; + char* name; + const Elf_Dyn *dp; + int error = 0; + + /* + * All files are dependant on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag == DT_NEEDED) { + name = ef->strtab + dp->d_un.d_val; + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + } + } + +out: + return error; +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym) +{ + elf_file_t ef = lf->priv; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = lf->priv; + Elf_Sym* es = (Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = lf->priv; + u_long off = (u_long) value; + u_long diff = off; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + if (off >= es->st_value) { + if (off - es->st_value < diff) { + diff = off - es->st_value; + best = es; + if (diff == 0) + break; + } else if (off - es->st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} diff --git a/sys/kern/makedevops.pl b/sys/kern/makedevops.pl new file mode 100644 index 0000000..24e0b14 --- /dev/null +++ b/sys/kern/makedevops.pl @@ -0,0 +1,394 @@ +#!/usr/bin/perl +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# From @(#)makedevops.sh 1.1 1998/06/14 13:53:12 dfr Exp $ +# From @(#)makedevops.sh ?.? 1998/10/05 + +# +# Script to produce device front-end sugar. +# + +$debug = 0; +$cfile = 0; # by default do not produce any file type +$hfile = 0; + +$keepcurrentdir = 1; + +$line_width = 80; + +use File::Basename; + +# Process the command line +# +while ( $arg = shift @ARGV ) { + if ( $arg eq '-c' ) { + warn "Producing .c output files" + if $debug; + $cfile = 1; + } elsif ( $arg eq '-h' ) { + warn "Producing .h output files" + if $debug; + $hfile = 1; + } elsif ( $arg eq '-ch' || $arg eq '-hc' ) { + warn "Producing .c and .h output files" + if $debug; + $cfile = 1; + $hfile = 1; + } elsif ( $arg eq '-d' ) { + $debug = 1; + } elsif ( $arg eq '-p' ) { + warn "Will produce files in original not in current directory" + if $debug; + $keepcurrentdir = 0; + } elsif ( $arg eq '-l' ) { + if ( $line_width = shift @ARGV and $line_width > 0 ) { + warn "Line width set to $line_width" + if $debug; + } else { + die "Please specify a valid line width after -l"; + } + } elsif ( $arg =~ m/\.m$/ ) { + warn "Filename: $arg" + if $debug; + push @filenames, $arg; + } else { + warn "$arg ignored" + if $debug; + } +} + + +# Validate the command line parameters +# +die "usage: $0 [-d] [-p] [-c|-h] srcfile +where -c produce only .c files + -h produce only .h files + -p use the path component in the source file for destination dir + -l set line width for output files [80] + -d switch on debugging +" + unless ($cfile or $hfile) + and $#filenames != -1; + +# FIXME should be able to do this more easily +# +$tmpdir = $ENV{'TMPDIR'}; # environment variables +$tmpdir = $ENV{'TMP'} + if !$tmpdir; +$tmpdir = $ENV{'TEMP'} + if !$tmpdir; +$tmpdir = '/tmp' # look for a physical directory + if !$tmpdir and -d '/tmp'; +$tmpdir = '/usr/tmp' + if !$tmpdir and -d '/usr/tmp'; +$tmpdir = '/var/tmp' + if !$tmpdir and -d '/var/tmp'; +$tmpdir = '.' 
# give up and use current dir + if !$tmpdir; + +foreach $src ( @filenames ) { + # Names of the created files + $ctmpname = "$tmpdir/ctmp.$$"; + $htmpname = "$tmpdir/htmp.$$"; + + ($name, $path, $suffix) = &fileparse($src, '.m'); + $path = '.' + if $keepcurrentdir; + $cfilename="$path/$name.c"; + $hfilename="$path/$name.h"; + + warn "Processing from $src to $cfile / $hfile via $ctmp / $htmp" + if $debug; + + die "Could not open $src, $!" + if !open SRC, "$src"; + die "Could not open $ctmpname, $!" + if $cfile and !open CFILE, ">$ctmpname"; + die "Could not open $htmpname, $!" + if $hfile and !open HFILE, ">$htmpname"; + + if ( $cfile ) { + # Produce the header of the C file + # + print CFILE "/*\n"; + print CFILE " * This file is produced automatically.\n"; + print CFILE " * Do not modify anything in here by hand.\n"; + print CFILE " *\n"; + print CFILE " * Created from\n"; + print CFILE " * $src\n"; + print CFILE " * with\n"; + print CFILE " * $0\n"; + print CFILE " */\n"; + print CFILE "\n"; + print CFILE "#include <sys/param.h>\n"; + print CFILE "#include <sys/queue.h>\n"; + print CFILE "#include <sys/bus_private.h>\n"; + } + + if ( $hfile ) { + # Produce the header of the H file + # + print HFILE "/*\n"; + print HFILE " * This file is produced automatically.\n"; + print HFILE " * Do not modify anything in here by hand.\n"; + print HFILE " *\n"; + print HFILE " * Created from\n"; + print HFILE " * $src\n"; + print HFILE " * with\n"; + print HFILE " * $0\n"; + print HFILE " */\n"; + print HFILE "\n"; + } + + %methods = (); # clear list of methods + $lineno = 0; + $error = 0; # to signal clean up and gerror setting + + LINE: while ( $line = <SRC> ) { + $lineno++; + + # take special notice of include directives. + # + if ( $line =~ m/^#\s*include\s+(["<])([^">]+)([">]).*/i ) { + warn "Included file: $1$2" . ($1 eq '<'? '>':'"') + if $debug; + print CFILE "#include $1$2" . ($1 eq '<'? '>':'"') . "\n" + if $cfile; + } + + $line =~ s/#.*//; # remove comments + $line =~ s/^\s+//; # remove leading ... + $line =~ s/\s+$//; # remove trailing whitespace + + if ( $line =~ m/^$/ ) { # skip empty lines + # nop + + } elsif ( $line =~ m/^INTERFACE\s*([^\s;]*)(\s*;?)/i ) { + $intname = $1; + $semicolon = $2; + unless ( $intname =~ m/^[a-z_][a-z0-9_]*$/ ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid interface name '$intname', use [a-z_][a-z0-9_]*"; + $error = 1; + last LINE; + } + + warn "$src:$lineno: semicolon missing at end of line, no problem" + if $semicolon !~ s/;$//; + + warn "Interface $intname" + if $debug; + + print HFILE '#ifndef _'.$intname."_if_h_\n" + if $hfile; + print HFILE '#define _'.$intname."_if_h_\n\n" + if $hfile; + print CFILE '#include "'.$intname.'_if.h"'."\n\n" + if $cfile; + + } elsif ( $line =~ m/^METHOD/i ) { + # Get the return type function name and delete that from + # the line. What is left is the possibly first function argument + # if it is on the same line. + # + # FIXME For compatibilities sake METHOD and METHODE is accepted. 
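To make the output templates further down easier to follow, here is roughly what the script aims to emit for a hypothetical interface foo containing METHOD int bar { device_t dev; int unit; }; (the names are invented for illustration; real input comes from the .m interface description files):

/* foo_if.h */
extern struct device_op_desc foo_bar_desc;
typedef int foo_bar_t(device_t dev, int unit);
foo_bar_t FOO_BAR;

/* foo_if.c */
#include "foo_if.h"

struct device_op_desc foo_bar_desc = {
	0, "foo_bar"
};

int FOO_BAR(device_t dev, int unit)
{
	foo_bar_t *m = (foo_bar_t *) DEVOPMETH(dev, foo_bar);
	return m(dev, unit);
}

The generated dispatcher simply looks the method up in the device's operation table via DEVOPMETH() and calls it with the original arguments.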
+ # + if ( !$intname ) { + warn "$src:$lineno: No interface name defined"; + $error = 1; + last LINE; + } + $line =~ s/^METHODE?\s+([^{]+?)\s*{\s*//i; + @ret = split m/\s+/, $1; + $name = pop @ret; # last element is name of method + $ret = join(" ", @ret); # return type + + warn "Method: name=$name return type=$ret" + if $debug; + + if ( !$name or !$ret ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid method specification"; + $error = 1; + last LINE; + } + + unless ( $name =~ m/^[a-z_][a-z_0-9]*$/ ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid method name '$name', use [a-z_][a-z0-9_]*"; + $error = 1; + last LINE; + } + + if ( defined($methods{$name}) ) { + warn "$src:$lineno: Duplicate method name"; + $error = 1; + last LINE; + } + + $methods{$name} = 'VIS'; + + while ( $line !~ m/}/ and $line .= <SRC> ) { } + + if ( $line !~ s/};?(.*)// ) { # remove first '}' and trailing garbage + # The '}' was not there (the rest is optional), so complain + warn "$src:$lineno: Premature end of file"; + $error = 1; + last LINE; + } + warn "$src:$lineno: Ignored '$1'" # warn about garbage at end of line + if $debug and $1; + + # Create a list of variables without the types prepended + # + $line =~ s/^\s+//; # remove leading ... + $line =~ s/\s+$//; # ... and trailing whitespace + $line =~ s/\s+/ /; # remove double spaces + + @arguments = split m/\s*;\s*/, $line; + @varnames = (); # list of varnames + foreach $argument (@arguments) { + next # skip argument if argument is empty + if !$argument; + + @ar = split m/[*\s]+/, $argument; + if ( $#ar == 0 ) { # only 1 word in argument? + warn "$src:$lineno: no type for '$argument'"; + $error = 1; + last LINE; + } + + push @varnames, $ar[-1]; # last element is name of variable + }; + + warn 'Arguments: ' . join(', ', @arguments) . "\n" + . 'Varnames: ' . join(', ', @varnames) + if $debug; + + $mname = $intname.'_'.$name; # method name + $umname = uc($mname); # uppercase method name + + $arguments = join(", ", @arguments); + $varnames = join(", ", @varnames); + + if ( $hfile ) { + # the method description + print HFILE "extern struct device_op_desc $mname\_desc;\n"; + # the method typedef + print HFILE &format_line("typedef $ret $mname\_t($arguments);", + $line_width, ', ', + ',',' ' x length("typedef $ret $mname\_t(")) + . "\n"; + # the method declaration + print HFILE "$mname\_t $umname;\n\n"; + } + + if ( $cfile ) { + # Print out the method desc + print CFILE "struct device_op_desc $mname\_desc = {\n"; + print CFILE "\t0, \"$mname\"\n"; + print CFILE "};\n\n"; + + # Print out the method itself + if ( 0 ) { # haven't chosen the format yet + print CFILE "$ret $umname($varnames)\n"; + print CFILE "\t".join(";\n\t", @arguments).";\n"; + } else { + print CFILE &format_line("$ret $umname($arguments)", + $line_width, ', ', + ',', ' ' x length("$ret $umname(")) . "\n"; + } + print CFILE "{\n"; + print CFILE &format_line("\t$mname\_t *m = ($mname\_t *) DEVOPMETH(dev, $mname);", + $line_width-8, ' = ', ' =', "\t\t") + . "\n"; + print CFILE "\t".($ret eq 'void'? '':'return ') . 
"m($varnames);\n"; + print CFILE "}\n\n"; + } + } else { + warn $line + if $debug; + warn "$src:$lineno: Invalid line encountered"; + $error = 1; + last LINE; + } + } # end LINE + + # print the final '#endif' in the header file + # + print HFILE "#endif /* _".$intname."_if_h_ */\n" + if $hfile; + + close SRC; + close CFILE + if $cfile; + close HFILE + if $hfile; + + if ( !$error ) { + if ( $cfile ) { + ($rc = system("mv $ctmpname $cfilename")) + and warn "mv $ctmpname $cfilename failed, $rc"; + } + + if ( $hfile ) { + ($rc = system("mv $htmpname $hfilename")) + and warn "mv $htmpname $hfilename failed, $rc"; + } + } else { + warn 'File' . ($hfile and $cfile? 's':'') . ' skipped'; + ($rc = system("rm -f $htmpname $ctmpname")) + and warn "rm -f $htmpname $ctmpname failed, $rc"; + $gerror = 1; + } +} + +exit $gerror; + + +sub format_line { + my ($line, $maxlength, $break, $new_end, $new_start) = @_; + my $rline = ""; + + while ( length($line) > $maxlength + and ($i = rindex $line, $break, $maxlength-length($new_end)) != -1 ) { + $rline .= substr($line, 0, $i) . $new_end . "\n"; + $line = $new_start . substr($line, $i+length($break)); + } + + return $rline . $line; +} diff --git a/sys/kern/makedevops.sh b/sys/kern/makedevops.sh new file mode 100644 index 0000000..a5e9ebd --- /dev/null +++ b/sys/kern/makedevops.sh @@ -0,0 +1,232 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: makedevops.sh,v 1.1 1998/06/14 13:53:12 dfr Exp $ +# + +# Script to produce device front-end sugar. +# +# usage: makedevops.sh srcfile +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. 
+# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 2 ] ; then + echo 'usage: makedevops.sh [-c|-h] srcfile' + exit 1 +fi + +makec=0 +makeh=0 + +if [ "$1" = "-c" ]; then + makec=1 +fi + +if [ "$1" = "-h" ]; then + makeh=1 +fi + +# Name of the source file. +SRC=$2 + +# Names of the created files. +CTMP=ctmp$$ +HTMP=htmp$$ + +CFILE=`basename $SRC .m`.c +HFILE=`basename $SRC .m`.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Awk script to take file.do and turn it into file.h and file.c +$AWK " + BEGIN { + src = \"$SRC\"; + header = \"$HTMP\"; + cfile = \"$CTMP\"; + hfile = \"$HFILE\"; + "' + + printf("/*\n") > header; + printf(" * This file is produced automatically.\n") > header; + printf(" * Do not modify anything in here by hand.\n") > header; + printf(" *\n") > header; + printf(" * Created from %s with makedevops.sh\n", src) > header; + printf(" */\n\n") > header; + + printf("/*\n") > cfile; + printf(" * This file is produced automatically.\n") > cfile; + printf(" * Do not modify anything in here by hand.\n") > cfile; + printf(" *\n") > cfile; + printf(" * Created from %s with makedevops.sh\n", src) > cfile; + printf(" */\n\n") > cfile; + printf("#include <sys/param.h>\n") > cfile; + printf("#include <sys/queue.h>\n") > cfile; + printf("#include <sys/bus_private.h>\n") > cfile; + + methodcount = 0 + } + NF == 0 { + next; + } + /^#include/ { + print $0 > cfile; + } + /^#/ { + next; + } + /^INTERFACE/ { + intname = $2; + printf("#ifndef _%s_if_h_\n", intname) > header; + printf("#define _%s_if_h_\n\n", intname) > header; + printf("#include \"%s\"\n\n", hfile) > cfile; + } + /^METHOD/ { + # Get the function name and return type. + ret = ""; + sep = ""; + for (i = 2; i < NF - 1; i++) { + ret = sep $i; + sep = " "; + } + name = $i; + + # Get the function arguments. 
+ for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + methods[methodcount++] = name; + + mname = intname "_" name; + umname = toupper(mname); + + # Print out the method declaration + printf("extern struct device_op_desc %s_desc;\n", mname) > header; + printf("%s %s(", ret, umname) > header; + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = " );\n"; + c3 = split(a[c2], t); + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > header; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > header; + } + + # Print the method desc + printf("struct device_op_desc %s_desc = {\n", mname) > cfile; + printf("\t0,\n") > cfile; + printf("\t\"%s\"\n", mname) > cfile; + printf("};\n\n") > cfile; + + # Print out the method typedef + printf("typedef %s %s_t(\n", ret, mname) > cfile; + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ");\n"; + c3 = split(a[c2], t); + printf("\t") > cfile; + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > cfile; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > cfile; + } + + # Print out the method itself + printf("%s %s(\n", ret, umname) > cfile; + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + printf("\t") > cfile; + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > cfile; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > cfile; + } + printf("{\n") > cfile; + printf("\t%s_t *m = (%s_t *) DEVOPMETH(dev, %s);\n", + mname, mname, mname) > cfile; + if (ret != "void") + printf("\treturn m(") > cfile; + else + printf("\tm(") > cfile; + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ");\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep) > cfile; + } + printf("}\n\n") > cfile; + } + END { + printf("\n#endif /* _%s_if_h_ */\n", intname) > header; + }' < $SRC + +if [ $makec = 1 ]; then + mv $CTMP $CFILE +else + rm $CTMP +fi + +if [ $makeh = 1 ]; then + mv $HTMP $HFILE +else + rm $HTMP +fi diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 0000000..0cbd247 --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,394 @@ +#! /bin/sh - +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $Id: makesyscalls.sh,v 1.34 1998/06/09 03:32:05 bde Exp $ + +set -e + +# name of compat option: +compat=COMPAT_43 + +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +sysmk="../sys/syscall.mk" +syssw="init_sysent.c" +syshide="../sys/syscall-hide.h" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" + +# tmp files: +sysdcl="sysent.dcl.$$" +syscompat="sysent.compat.$$" +syscompatdcl="sysent.compatdcl.$$" +sysent="sysent.switch.$$" +sysinc="sysinc.switch.$$" +sysarg="sysarg.switch.$$" + +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 + +touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg + +case $# in + 0) echo "Usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac + +if [ -n "$2" -a -f "$2" ]; then + . 
$2 +fi + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + syssw = \"$syssw\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + sysmk = \"$sysmk\" + compat = \"$compat\" + syshide = \"$syshide\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > syssw + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf "# FreeBSD system call names.\n" > sysmk + printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk + printf "/*\n * System call hiders.\n *\n" > syshide + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide + } + NR == 1 { + gsub("[$]Id: ", "", $0) + gsub(" [$]", "", $0) + + printf " * created from%s\n */\n\n", $0 > syssw + + printf "\n/* The casts are bogus but will do for now. */\n" > sysent + printf "struct sysent %s[] = {\n",switchname > sysent + + printf " * created from%s\n */\n\n", $0 > sysarg + printf "#ifndef %s\n", sysproto_h > sysarg + printf "#define\t%s\n\n", sysproto_h > sysarg + printf "#include <sys/signal.h>\n\n" > sysarg + printf "struct proc;\n\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? 
\\\n" > sysarg + printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg + + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames + + printf " * created from%s\n */\n\n", $0 > syshdr + + printf "# created from%s\nMIASM = ", $0 > sysmk + + printf " * created from%s\n */\n\n", $0 > syshide + next + } + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # toss number and type + argc= 0; + bigargc = 0; + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ + + if ($f != "(") + parserr($f, ")") + f++ + + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } + + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + if (argtype[argc] == "off_t") + bigargc++ + argname[argc]=$f; + f += 2; # skip name, and any comma + } + } + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" \ + || $2 == "NOIMPL" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct\t%s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n", + argtype[i], argname[i], + argname[i], argtype[i]) > sysarg + printf("};\n") > sysarg + } + else if($2 != "NOARGS" && $2 != "NOPROTO") + printf("struct\t%s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + } + if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \ + (!lkmnosys || funcname != "lkmnosys")) { + printf("%s\t%s __P((struct proc *, struct %s *))", + rettype, funcname, argalias) > sysdcl + if (funcname == "exit") + printf(" __dead2") > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + if ($2 != "NOIMPL") { + printf("\t{ %d, (sy_call_t *)%s },\t\t", + argc+bigargc, funcname) > sysent + if(length(funcname) < 11) + 
printf("\t") > sysent + printf("/* %d = %s */\n", syscall, funcalias) > sysent + } else { + printf("\t{ %d, (sy_call_t *)%s },\t\t", + argc+bigargc, "nosys") > sysent + if(length("nosys") < 11) + printf("\t") > sysent + printf("/* %d = %s */\n", syscall, funcalias) > sysent + } + printf("\t\"%s\",\t\t\t/* %d = %s */\n", + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") { + printf("#define\t%s%s\t%d\n", syscallprefix, + funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + } + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "COMPAT" || $2 == "CPT_NOA" { + ncompat++ + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct\t%s {\n", argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n", + argtype[i], argname[i], + argname[i], argtype[i]) > syscompat + printf("};\n") > syscompat + } + else if($2 != "CPT_NOA") + printf("struct\t%s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + printf("%s\to%s __P((struct proc *, struct %s *));\n", + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", + syscall, funcalias) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "LIBCOMPAT" { + ncompat++ + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", + syscallprefix, funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n", + syscall, comment) > sysent + printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", + syscall, comment) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", + syscall, syscall, comment) > sysnames + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + if (ncompat != 0) { + printf "#include \"opt_compat.h\"\n\n" > syssw + printf "\n#ifdef %s\n", compat > sysinc + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc + printf "#else\n" > sysinc + printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc + printf "#endif\n" > sysinc + } + + printf("\n#endif /* %s */\n\n", compat) > syscompatdcl + printf("#undef PAD_\n") > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("\n") > sysmk + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >> $syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto 
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..d6175ee --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,342 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * $Id: md5c.c,v 1.14 1998/05/01 16:40:19 bde Exp $ + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include <sys/types.h> + +#ifdef KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <sys/md5.h> + + +#ifdef KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#if defined(__i386__) || defined(__alpha__) +#define Encode memcpy +#define Decode memcpy +#else /* __i386__ */ + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +/* XXX not prototyped, and not compatible with memcpy(). */ +static void +Encode (output, input, len) + unsigned char *output; + u_int32_t *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (output, input, len) + u_int32_t *output; + const unsigned char *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) | + (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24); +} +#endif /* i386 */ + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. 
+ */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. + */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +void +MD5Pad (context) + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + /* Do padding. */ + MD5Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
*/ + +void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], 
S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. */ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c new file mode 100644 index 0000000..9a70d5c --- /dev/null +++ b/sys/kern/p1003_1b.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 1996, 1997, 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* p1003_1b: Real Time common code. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/syslog.h> +#include <sys/module.h> +#include <sys/sysproto.h> +#include <sys/sysctl.h> + +#include <posix4/posix4.h> + +MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); + +/* p31b_proc: Return a proc struct corresponding to a pid to operate on. + * + * Enforce permission policy. + * + * The policy is the same as for sending signals except there + * is no notion of process groups. + * + * pid == 0 means my process. + * + * This is disabled until I've got a permission gate in again: + * only root can do this. + */ + +#if 0 +/* + * This is stolen from CANSIGNAL in kern_sig: + * + * Can process p, with pcred pc, do "write flavor" operations to process q? 
+ */ +#define CAN_AFFECT(p, pc, q) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid) +#else +#define CAN_AFFECT(p, pc, q) ((pc)->pc_ucred->cr_uid == 0) +#endif + +/* + * p31b_proc: Look up a proc from a PID. If proc is 0 it is + * my own proc. + */ +int p31b_proc(struct proc *p, pid_t pid, struct proc **pp) +{ + int ret = 0; + struct proc *other_proc = 0; + + if (pid == 0) + other_proc = p; + else + other_proc = pfind(pid); + + if (other_proc) + { + /* Enforce permission policy. + */ + if (CAN_AFFECT(p, p->p_cred, other_proc)) + *pp = other_proc; + else + ret = EPERM; + } + else + ret = ESRCH; + + return ret; +} + +/* The system calls return ENOSYS if an entry is called that is + * not run-time supported. I am also logging since some programs + * start to use this when they shouldn't. That will be removed if annoying. + */ +int +syscall_not_present(struct proc *p, const char *s, struct nosys_args *uap) +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); + + /* a " return nosys(p, uap); " here causes a core dump. + */ + + return ENOSYS; +} + +#if !defined(_KPOSIX_PRIORITY_SCHEDULING) + +/* Not configured but loadable via an LKM: + */ + +static int sched_attach(void) +{ + return 0; +} + +SYSCALL_NOT_PRESENT_GEN(sched_setparam) +SYSCALL_NOT_PRESENT_GEN(sched_getparam) +SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_yield) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) +SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) + +#else + +/* Configured in kernel version: + */ +static struct ksched *ksched; + +static int sched_attach(void) +{ + int ret = ksched_attach(&ksched); + + if (ret == 0) + p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); + + return ret; +} + +int sched_setparam(struct proc *p, + struct sched_setparam_args *uap) +{ + int e; + + struct sched_param sched_param; + copyin(uap->param, &sched_param, sizeof(sched_param)); + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_setparam(&p->p_retval[0], ksched, p, + (const struct sched_param *)&sched_param)) + ); + + return e; +} + +int sched_getparam(struct proc *p, + struct sched_getparam_args *uap) +{ + int e; + struct sched_param sched_param; + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_getparam(&p->p_retval[0], ksched, p, &sched_param)) + ); + + if (!e) + copyout(&sched_param, uap->param, sizeof(sched_param)); + + return e; +} +int sched_setscheduler(struct proc *p, + struct sched_setscheduler_args *uap) +{ + int e; + + struct sched_param sched_param; + copyin(uap->param, &sched_param, sizeof(sched_param)); + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_setscheduler(&p->p_retval[0], + ksched, p, uap->policy, + (const struct sched_param *)&sched_param)) + ); + + return e; +} +int sched_getscheduler(struct proc *p, + struct sched_getscheduler_args *uap) +{ + int e; + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_getscheduler(&p->p_retval[0], ksched, p)) + ); + + return e; +} +int sched_yield(struct proc *p, + struct sched_yield_args *uap) +{ + return ksched_yield(&p->p_retval[0], ksched); +} +int sched_get_priority_max(struct proc *p, + struct sched_get_priority_max_args *uap) +{ + return 
ksched_get_priority_max(&p->p_retval[0], + ksched, uap->policy); +} +int sched_get_priority_min(struct proc *p, + struct sched_get_priority_min_args *uap) +{ + return ksched_get_priority_min(&p->p_retval[0], + ksched, uap->policy); +} +int sched_rr_get_interval(struct proc *p, + struct sched_rr_get_interval_args *uap) +{ + int e; + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_rr_get_interval(&p->p_retval[0], ksched, + p, uap->interval)) + ); + + return e; +} + +#endif + +static void p31binit(void *notused) +{ + (void) sched_attach(); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c new file mode 100644 index 0000000..523f76b --- /dev/null +++ b/sys/kern/posix4_mib.c @@ -0,0 +1,94 @@ +/*- + * Copyright (c) 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <posix4/posix4.h> + +static int facility[CTL_P1003_1B_MAXID - 1]; + +/* OID_AUTO isn't working with sysconf(3). I guess I'd have to + * modify it to do a lookup by name from the index. + * For now I've left it a top-level sysctl. 
+ */ + +#if 1 + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_p1003_1b, num, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); + +#else + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); +SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B"); + +#endif + + +P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io); +P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range); +P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection); +P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing); +P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io); +P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling); +P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals); +P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores); +P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync); +P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects); +P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io); +P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers); +P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max); +P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max); +P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max); +P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize); +P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max); +P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max); +P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max); + +/* p31b_setcfg: Set the configuration + */ +void p31b_setcfg(int num, int value) +{ + if (num >= 1 && num < CTL_P1003_1B_MAXID) + facility[num - 1] = value; +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c new file mode 100644 index 0000000..9234732 --- /dev/null +++ b/sys/kern/subr_autoconf.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratories. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 + * + * $Id: subr_autoconf.c,v 1.7 1998/12/04 22:54:51 archie Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/device.h> +#ifdef UNUSED +#include <sys/malloc.h> +#endif + +/* + * Autoconfiguration subroutines. + */ + +#ifdef UNUSED +/* + * ioconf.c exports exactly two names: cfdata and cfroots. All system + * devices and drivers are found via these tables. + */ +extern struct cfdata cfdata[]; +extern short cfroots[]; + +#define ROOT ((struct device *)NULL) + +struct matchinfo { + cfmatch_t fn; + struct device *parent; + void *aux; + struct cfdata *match; + int pri; +}; + +/* + * Apply the matching function and choose the best. This is used + * a few times and we want to keep the code small. + */ +static void +mapply(m, cf) + register struct matchinfo *m; + register struct cfdata *cf; +{ + register int pri; + + if (m->fn != NULL) + pri = (*m->fn)(m->parent, cf, m->aux); + else + pri = (*cf->cf_driver->cd_match)(m->parent, cf, m->aux); + if (pri > m->pri) { + m->match = cf; + m->pri = pri; + } +} + +/* + * Iterate over all potential children of some device, calling the given + * function (default being the child's match function) for each one. + * Nonzero returns are matches; the highest value returned is considered + * the best match. Return the `found child' if we got a match, or NULL + * otherwise. The `aux' pointer is simply passed on through. + * + * Note that this function is designed so that it can be used to apply + * an arbitrary function to all potential children (its return value + * can be ignored). + */ +struct cfdata * +config_search(fn, parent, aux) + cfmatch_t fn; + register struct device *parent; + void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = parent; + m.aux = aux; + m.match = NULL; + m.pri = 0; + for (cf = cfdata; cf->cf_driver; cf++) { + /* + * Skip cf if no longer eligible, otherwise scan through + * parents for one matching `parent', and try match function. + */ + if (cf->cf_fstate == FSTATE_FOUND) + continue; + for (p = cf->cf_parents; *p >= 0; p++) + if (parent->dv_cfdata == &cfdata[*p]) + mapply(&m, cf); + } + return (m.match); +} + +/* + * Find the given root device. + * This is much like config_search, but there is no parent. + */ +struct cfdata * +config_rootsearch(fn, rootname, aux) + register cfmatch_t fn; + register char *rootname; + register void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = ROOT; + m.aux = aux; + m.match = NULL; + m.pri = 0; + /* + * Look at root entries for matching name. 
We do not bother + * with found-state here since only one root should ever be + * searched (and it must be done first). + */ + for (p = cfroots; *p >= 0; p++) { + cf = &cfdata[*p]; + if (strcmp(cf->cf_driver->cd_name, rootname) == 0) + mapply(&m, cf); + } + return (m.match); +} + +static char *msgs[3] = { "", " not configured\n", " unsupported\n" }; + +/* + * The given `aux' argument describes a device that has been found + * on the given parent, but not necessarily configured. Locate the + * configuration data for that device (using the cd_match configuration + * driver function) and attach it, and return true. If the device was + * not configured, call the given `print' function and return 0. + */ +int +config_found(parent, aux, print) + struct device *parent; + void *aux; + cfprint_t print; +{ + struct cfdata *cf; + + if ((cf = config_search((cfmatch_t)NULL, parent, aux)) != NULL) { + config_attach(parent, cf, aux, print); + return (1); + } + printf(msgs[(*print)(aux, parent->dv_xname)]); + return (0); +} + +/* + * As above, but for root devices. + */ +int +config_rootfound(rootname, aux) + char *rootname; + void *aux; +{ + struct cfdata *cf; + + if ((cf = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL) { + config_attach(ROOT, cf, aux, (cfprint_t)NULL); + return (1); + } + printf("root device %s not configured\n", rootname); + return (0); +} + +/* just like sprintf(buf, "%d") except that it works from the end */ +static char * +number(ep, n) + register char *ep; + register int n; +{ + + *--ep = 0; + while (n >= 10) { + *--ep = (n % 10) + '0'; + n /= 10; + } + *--ep = n + '0'; + return (ep); +} + +/* + * Attach a found device. Allocates memory for device variables. + */ +void +config_attach(parent, cf, aux, print) + register struct device *parent; + register struct cfdata *cf; + register void *aux; + cfprint_t print; +{ + register struct device *dev; + register struct cfdriver *cd; + register size_t lname, lunit; + register char *xunit; + int myunit; + char num[10]; + static struct device **nextp = &alldevs; + + cd = cf->cf_driver; + if (cd->cd_devsize < sizeof(struct device)) + panic("config_attach"); + myunit = cf->cf_unit; + if (cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + else + cf->cf_unit++; + + /* compute length of name and decimal expansion of unit number */ + lname = strlen(cd->cd_name); + xunit = number(&num[sizeof num], myunit); + lunit = &num[sizeof num] - xunit; + if (lname + lunit >= sizeof(dev->dv_xname)) + panic("config_attach: device name too long"); + + /* get memory for all device vars */ + dev = (struct device *)malloc(cd->cd_devsize, M_DEVBUF, M_WAITOK); + /* XXX cannot wait! */ + bzero(dev, cd->cd_devsize); + *nextp = dev; /* link up */ + nextp = &dev->dv_next; + dev->dv_class = cd->cd_class; + dev->dv_cfdata = cf; + dev->dv_unit = myunit; + bcopy(cd->cd_name, dev->dv_xname, lname); + bcopy(xunit, dev->dv_xname + lname, lunit); + dev->dv_parent = parent; + if (parent == ROOT) + printf("%s (root)", dev->dv_xname); + else { + printf("%s at %s", dev->dv_xname, parent->dv_xname); + (void) (*print)(aux, (char *)0); + } + + /* put this device in the devices array */ + if (dev->dv_unit >= cd->cd_ndevs) { + /* + * Need to expand the array. 
+ */ + int old = cd->cd_ndevs, oldbytes, new, newbytes; + void **nsp; + + if (old == 0) { + nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, MINALLOCSIZE); + cd->cd_ndevs = MINALLOCSIZE / sizeof(void *); + } else { + new = cd->cd_ndevs; + do { + new *= 2; + } while (new <= dev->dv_unit); + cd->cd_ndevs = new; + oldbytes = old * sizeof(void *); + newbytes = new * sizeof(void *); + nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ + bcopy(cd->cd_devs, nsp, oldbytes); + bzero(&nsp[old], newbytes - oldbytes); + free(cd->cd_devs, M_DEVBUF); + } + cd->cd_devs = nsp; + } + if (cd->cd_devs[dev->dv_unit]) + panic("config_attach: duplicate %s", dev->dv_xname); + cd->cd_devs[dev->dv_unit] = dev; + + /* + * Before attaching, clobber any unfound devices that are + * otherwise identical. + */ + for (cf = cfdata; cf->cf_driver; cf++) + if (cf->cf_driver == cd && cf->cf_unit == dev->dv_unit && + cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + (*cd->cd_attach)(parent, dev, aux); +} + +/* + * Attach an event. These must come from initially-zero space (see + * commented-out assignments below), but that occurs naturally for + * device instance variables. + */ +void +evcnt_attach(dev, name, ev) + struct device *dev; + const char *name; + struct evcnt *ev; +{ + static struct evcnt **nextp = &allevents; + + KASSERT(strlen(name) < sizeof(ev->ev_name), ("evcnt_attach")); + + /* ev->ev_next = NULL; */ + ev->ev_dev = dev; + /* ev->ev_count = 0; */ + snprintf(ev->ev_name, sizeof(ev->ev_name), "%s", name); + *nextp = ev; + nextp = &ev->ev_next; +} + +#endif + +/* + * "Interrupt driven config" functions. + */ +static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list = + TAILQ_HEAD_INITIALIZER(intr_config_hook_list); + + +/* ARGSUSED */ +static void run_interrupt_driven_config_hooks __P((void *dummy)); +static void +run_interrupt_driven_config_hooks(dummy) + void *dummy; +{ + struct intr_config_hook *hook; + + for (hook = intr_config_hook_list.tqh_first; hook != NULL; + hook = hook->ich_links.tqe_next) { + (*hook->ich_func)(hook->ich_arg); + } + + while (intr_config_hook_list.tqh_first != NULL) { + tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0); + } +} +SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, + run_interrupt_driven_config_hooks, NULL) + +/* + * Register a hook that will be called after "cold" + * autoconfiguration is complete and interrupts can + * be used to complete initialization. + */ +int +config_intrhook_establish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL; + hook_entry = hook_entry->ich_links.tqe_next) + if (hook_entry == hook) + break; + if (hook_entry != NULL) { + printf("config_intrhook_establish: establishing an " + "already established hook.\n"); + return (1); + } + TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links); + if (cold == 0) + /* XXX Sufficient for LKMs loaded after initial config??? 
*/ + run_interrupt_driven_config_hooks(NULL); + return (0); +} + +void +config_intrhook_disestablish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL; + hook_entry = hook_entry->ich_links.tqe_next) + if (hook_entry == hook) + break; + if (hook_entry == NULL) + panic("config_intrhook_disestablish: disestablishing an " + "unestablished hook"); + + TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links); + /* Wakeup anyone watching the list */ + wakeup(&intr_config_hook_list); +} diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c new file mode 100644 index 0000000..10af2ea --- /dev/null +++ b/sys/kern/subr_blist.c @@ -0,0 +1,928 @@ + +/* + * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting + * + * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution + * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT. + * + * This module implements a general bitmap allocator/deallocator. The + * allocator eats around 2 bits per 'block'. The module does not + * try to interpret the meaning of a 'block' other then to return + * SWAPBLK_NONE on an allocation failure. + * + * A radix tree is used to maintain the bitmap. Two radix constants are + * involved: One for the bitmaps contained in the leaf nodes (typically + * 32), and one for the meta nodes (typically 16). Both meta and leaf + * nodes have a hint field. This field gives us a hint as to the largest + * free contiguous range of blocks under the node. It may contain a + * value that is too high, but will never contain a value that is too + * low. When the radix tree is searched, allocation failures in subtrees + * update the hint. + * + * The radix tree also implements two collapsed states for meta nodes: + * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is + * in either of these two states, all information contained underneath + * the node is considered stale. These states are used to optimize + * allocation and freeing operations. + * + * The hinting greatly increases code efficiency for allocations while + * the general radix structure optimizes both allocations and frees. The + * radix tree should be able to operate well no matter how much + * fragmentation there is and no matter how large a bitmap is used. + * + * Unlike the rlist code, the blist code wires all necessary memory at + * creation time. Neither allocations nor frees require interaction with + * the memory subsystem. In contrast, the rlist code may allocate memory + * on an rlist_free() call. The non-blocking features of the blist code + * are used to great advantage in the swap code (vm/nswap_pager.c). The + * rlist code uses a little less overall memory then the blist code (but + * due to swap interleaving not all that much less), but the blist code + * scales much, much better. + * + * LAYOUT: The radix tree is layed out recursively using a + * linear array. Each meta node is immediately followed (layed out + * sequentially in memory) by BLIST_META_RADIX lower level nodes. This + * is a recursive structure but one that can be easily scanned through + * a very simple 'skip' calculation. In order to support large radixes, + * portions of the tree may reside outside our memory allocation. We + * handle this with an early-termination optimization (when bighint is + * set to -1) on the scan. 
The memory allocation is only large enough + * to cover the number of blocks requested at creation time even if it + * must be encompassed in larger root-node radix. + * + * NOTE: the allocator cannot currently allocate more then + * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too + * large' if you try. This is an area that could use improvement. The + * radix is large enough that this restriction does not effect the swap + * system, though. Currently only the allocation code is effected by + * this algorithmic unfeature. The freeing code can handle arbitrary + * ranges. + * + * This code can be compiled stand-alone for debugging. + */ + +#ifdef KERNEL + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/blist.h> +#include <sys/malloc.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#else + +#ifndef BLIST_NO_DEBUG +#define BLIST_DEBUG +#endif + +#define SWAPBLK_NONE ((daddr_t)-1) + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> + +#define malloc(a,b,c) malloc(a) +#define free(a,b) free(a) + +typedef unsigned int u_daddr_t; + +#include <sys/blist.h> + +void panic(const char *ctl, ...); + +#endif + +/* + * static support functions + */ + +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, + daddr_t count, daddr_t radix, int skip); +static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); +static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, + daddr_t radix, int skip, daddr_t blk); +static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, blist_t dest, daddr_t count); +static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, + int skip, daddr_t count); +#ifndef KERNEL +static void blst_radix_print(blmeta_t *scan, daddr_t blk, + daddr_t radix, int skip, int tab); +#endif + +#ifdef KERNEL +static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); +#endif + +/* + * blist_create() - create a blist capable of handling up to the specified + * number of blocks + * + * blocks must be greater then 0 + * + * The smallest blist consists of a single leaf node capable of + * managing BLIST_BMAP_RADIX blocks. + */ + +blist_t +blist_create(daddr_t blocks) +{ + blist_t bl; + int radix; + int skip = 0; + + /* + * Calculate radix and skip field used for scanning. 
+ */ + radix = BLIST_BMAP_RADIX; + + while (radix < blocks) { + radix <<= BLIST_META_RADIX_SHIFT; + skip = (skip + 1) << BLIST_META_RADIX_SHIFT; + } + + bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK); + + bzero(bl, sizeof(*bl)); + + bl->bl_blocks = blocks; + bl->bl_radix = radix; + bl->bl_skip = skip; + bl->bl_rootblks = 1 + + blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); + bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK); + +#if defined(BLIST_DEBUG) + printf( + "BLIST representing %d blocks (%d MB of swap)" + ", requiring %dK of ram\n", + bl->bl_blocks, + bl->bl_blocks * 4 / 1024, + (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + ); + printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks); +#endif + blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); + + return(bl); +} + +void +blist_destroy(blist_t bl) +{ + free(bl->bl_root, M_SWAP); + free(bl, M_SWAP); +} + +/* + * blist_alloc() - reserve space in the block bitmap. Return the base + * of a contiguous region or SWAPBLK_NONE if space could + * not be allocated. + */ + +daddr_t +blist_alloc(blist_t bl, daddr_t count) +{ + daddr_t blk = SWAPBLK_NONE; + + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blk = blst_leaf_alloc(bl->bl_root, 0, count); + else + blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); + if (blk != SWAPBLK_NONE) + bl->bl_free -= count; + } + return(blk); +} + +/* + * blist_free() - free up space in the block bitmap. Return the base + * of a contiguous region. Panic if an inconsistancy is + * found. + */ + +void +blist_free(blist_t bl, daddr_t blkno, daddr_t count) +{ + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blst_leaf_free(bl->bl_root, blkno, count); + else + blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); + bl->bl_free += count; + } +} + +/* + * blist_resize() - resize an existing radix tree to handle the + * specified number of blocks. This will reallocate + * the tree and transfer the previous bitmap to the new + * one. When extending the tree you can specify whether + * the new blocks are to left allocated or freed. + */ + +void +blist_resize(blist_t *pbl, daddr_t count, int freenew) +{ + blist_t newbl = blist_create(count); + blist_t save = *pbl; + + *pbl = newbl; + if (count > save->bl_blocks) + count = save->bl_blocks; + blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); + + /* + * If resizing upwards, should we free the new space or not? + */ + if (freenew && count < newbl->bl_blocks) { + blist_free(newbl, count, newbl->bl_blocks - count); + } + blist_destroy(save); +} + +#ifdef BLIST_DEBUG + +/* + * blist_print() - dump radix tree + */ + +void +blist_print(blist_t bl) +{ + printf("BLIST {\n"); + blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); + printf("}\n"); +} + +#endif + +/************************************************************************ + * ALLOCATION SUPPORT FUNCTIONS * + ************************************************************************ + * + * These support functions do all the actual work. They may seem + * rather longish, but that's because I've commented them up. The + * actual code is straight forward. + * + */ + +/* + * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). + * + * This is the core of the allocator and is optimized for the 1 block + * and the BLIST_BMAP_RADIX block allocation cases. Other cases are + * somewhat slower. The 1 block allocation case is log2 and extremely + * quick. 
+ */ + +static daddr_t +blst_leaf_alloc( + blmeta_t *scan, + daddr_t blk, + int count +) { + u_daddr_t orig = scan->u.bmu_bitmap; + + if (orig == 0) { + /* + * Optimize bitmap all-allocated case. Also, count = 1 + * case assumes at least 1 bit is free in the bitmap, so + * we have to take care of this case here. + */ + scan->bm_bighint = 0; + return(SWAPBLK_NONE); + } + if (count == 1) { + /* + * Optimized code to allocate one bit out of the bitmap + */ + u_daddr_t mask; + int j = BLIST_BMAP_RADIX/2; + int r = 0; + + mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); + + while (j) { + if ((orig & mask) == 0) { + r += j; + orig >>= j; + } + j >>= 1; + mask >>= j; + } + scan->u.bmu_bitmap &= ~(1 << r); + return(blk + r); + } + if (count <= BLIST_BMAP_RADIX) { + /* + * non-optimized code to allocate N bits out of the bitmap. + * The more bits, the faster the code runs. It will run + * the slowest allocating 2 bits, but since there aren't any + * memory ops in the core loop (or shouldn't be, anyway), + * you probably won't notice the difference. + */ + int j; + int n = BLIST_BMAP_RADIX - count; + u_daddr_t mask; + + mask = (u_daddr_t)-1 >> n; + + for (j = 0; j <= n; ++j) { + if ((orig & mask) == mask) { + scan->u.bmu_bitmap &= ~mask; + return(blk + j); + } + mask = (mask << 1); + } + } + /* + * We couldn't allocate count in this subtree, update bighint. + */ + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * blist_meta_alloc() - allocate at a meta in the radix tree. + * + * Attempt to allocate at a meta node. If we can't, we update + * bighint and return a failure. Updating bighint optimize future + * calls that hit this node. We have to check for our collapse cases + * and we have a few optimizations strewn in as well. + */ + +static daddr_t +blst_meta_alloc( + blmeta_t *scan, + daddr_t blk, + daddr_t count, + daddr_t radix, + int skip +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + scan->bm_bighint = count; + return(SWAPBLK_NONE); + } + + if (scan->u.bmu_avail == radix) { + radix >>= BLIST_META_RADIX_SHIFT; + + /* + * ALL-FREE special case, initialize uninitialize + * sublevel. + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (u_daddr_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = radix; + scan[i].u.bmu_avail = radix; + } + } + } else { + radix >>= BLIST_META_RADIX_SHIFT; + } + + for (i = 1; i <= skip; i += next_skip) { + if (count <= scan[i].bm_bighint) { + /* + * count fits in object + */ + daddr_t r; + if (next_skip == 1) { + r = blst_leaf_alloc(&scan[i], blk, count); + } else { + r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); + } + if (r != SWAPBLK_NONE) { + scan->u.bmu_avail -= count; + if (scan->bm_bighint > scan->u.bmu_avail) + scan->bm_bighint = scan->u.bmu_avail; + return(r); + } + } else if (scan[i].bm_bighint == (daddr_t)-1) { + /* + * Terminator + */ + break; + } else if (count > radix) { + /* + * count does not fit in object even if it were + * complete free. + */ + panic("blist_meta_alloc: allocation too large"); + } + blk += radix; + } + + /* + * We couldn't allocate count in this subtree, update bighint. 
+ */ + if (scan->bm_bighint >= count) + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * BLST_LEAF_FREE() - free allocated block from leaf bitmap + * + */ + +static void +blst_leaf_free( + blmeta_t *scan, + daddr_t blk, + int count +) { + /* + * free some data in this bitmap + * + * e.g. + * 0000111111111110000 + * \_________/\__/ + * v n + */ + int n = blk & (BLIST_BMAP_RADIX - 1); + u_daddr_t mask; + + mask = ((u_daddr_t)-1 << n) & + ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + if (scan->u.bmu_bitmap & mask) + panic("blst_radix_free: freeing free block"); + scan->u.bmu_bitmap |= mask; + + /* + * We could probably do a better job here. We are required to make + * bighint at least as large as the biggest contiguous block of + * data. If we just shoehorn it, a little extra overhead will + * be incured on the next allocation (but only that one typically). + */ + scan->bm_bighint = BLIST_BMAP_RADIX; +} + +/* + * BLST_META_FREE() - free allocated blocks from radix tree meta info + * + * This support routine frees a range of blocks from the bitmap. + * The range must be entirely enclosed by this radix node. If a + * meta node, we break the range down recursively to free blocks + * in subnodes (which means that this code can free an arbitrary + * range whereas the allocation code cannot allocate an arbitrary + * range). + */ + +static void +blst_meta_free( + blmeta_t *scan, + daddr_t freeBlk, + daddr_t count, + daddr_t radix, + int skip, + daddr_t blk +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + +#if 0 + printf("FREE (%x,%d) FROM (%x,%d)\n", + freeBlk, count, + blk, radix + ); +#endif + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case, with possible + * shortcut to ALL-FREE special case. + */ + scan->u.bmu_avail = count; + scan->bm_bighint = count; + + if (count != radix) { + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + scan[i].bm_bighint = 0; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = 0; + } else { + scan[i].u.bmu_avail = 0; + } + } + /* fall through */ + } + } else { + scan->u.bmu_avail += count; + /* scan->bm_bighint = radix; */ + } + + /* + * ALL-FREE special case. + */ + + if (scan->u.bmu_avail == radix) + return; +#if !defined(MAX_PERF) + if (scan->u.bmu_avail > radix) + panic("blst_meta_free: freeing already free blocks (%d) %d/%d", count, scan->u.bmu_avail, radix); +#endif + + /* + * Break the free down into its components + */ + + radix >>= BLIST_META_RADIX_SHIFT; + + i = (freeBlk - blk) / radix; + blk += i * radix; + i = i * next_skip + 1; + + while (i <= skip && blk < freeBlk + count) { + daddr_t v; + + v = blk + radix - freeBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (daddr_t)-1) + panic("blst_meta_free: freeing unexpected range"); + + if (next_skip == 1) { + blst_leaf_free(&scan[i], freeBlk, v); + } else { + blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); + } + if (scan->bm_bighint < scan[i].bm_bighint) + scan->bm_bighint = scan[i].bm_bighint; + count -= v; + freeBlk += v; + blk += radix; + i += next_skip; + } +} + +/* + * BLIST_RADIX_COPY() - copy one radix tree to another + * + * Locates free space in the source tree and frees it in the destination + * tree. The space may not already be free in the destination. 
+ */ + +static void blst_copy( + blmeta_t *scan, + daddr_t blk, + daddr_t radix, + daddr_t skip, + blist_t dest, + daddr_t count +) { + int next_skip; + int i; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + u_daddr_t v = scan->u.bmu_bitmap; + + if (v == (u_daddr_t)-1) { + blist_free(dest, blk, count); + } else if (v != 0) { + int i; + + for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { + if (v & (1 << i)) + blist_free(dest, blk + i, 1); + } + } + return; + } + + /* + * Meta node + */ + + if (scan->u.bmu_avail == 0) { + /* + * Source all allocated, leave dest allocated + */ + return; + } + if (scan->u.bmu_avail == radix) { + /* + * Source all free, free entire dest + */ + if (count < radix) + blist_free(dest, blk, count); + else + blist_free(dest, blk, radix); + return; + } + + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; count && i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + + if (count >= radix) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + radix + ); + count -= radix; + } else { + if (count) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + count + ); + } + count = 0; + } + blk += radix; + } +} + +/* + * BLST_RADIX_INIT() - initialize radix tree + * + * Initialize our meta structures and bitmaps and calculate the exact + * amount of space required to manage 'count' blocks - this space may + * be considerably less then the calculated radix due to the large + * RADIX values we use. + */ + +static daddr_t +blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) +{ + int i; + int next_skip; + daddr_t memindex = 0; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_bitmap = 0; + } + return(memindex); + } + + /* + * Meta node. If allocating the entire object we can special + * case it. However, we need to figure out how much memory + * is required to manage 'count' blocks, so we continue on anyway. + */ + + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_avail = 0; + } + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; i <= skip; i += next_skip) { + if (count >= radix) { + /* + * Allocate the entire object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + radix + ); + count -= radix; + } else if (count > 0) { + /* + * Allocate a partial object + */ + memindex = i + blst_radix_init( + ((scan) ? 
&scan[i] : NULL), + radix, + next_skip - 1, + count + ); + count = 0; + } else { + /* + * Add terminator and break out + */ + if (scan) + scan[i].bm_bighint = (daddr_t)-1; + break; + } + } + if (memindex < i) + memindex = i; + return(memindex); +} + +#ifdef BLIST_DEBUG + +static void +blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) +{ + int i; + int next_skip; + int lastState = 0; + + if (radix == BLIST_BMAP_RADIX) { + printf( + "%*.*s(%04x,%d): bitmap %08x big=%d\n", + tab, tab, "", + blk, radix, + scan->u.bmu_bitmap, + scan->bm_bighint + ); + return; + } + + if (scan->u.bmu_avail == 0) { + printf( + "%*.*s(%04x,%d) ALL ALLOCATED\n", + tab, tab, "", + blk, + radix + ); + return; + } + if (scan->u.bmu_avail == radix) { + printf( + "%*.*s(%04x,%d) ALL FREE\n", + tab, tab, "", + blk, + radix + ); + return; + } + + printf( + "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n", + tab, tab, "", + blk, radix, + scan->u.bmu_avail, + radix, + scan->bm_bighint + ); + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + tab += 4; + + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) { + printf( + "%*.*s(%04x,%d): Terminator\n", + tab, tab, "", + blk, radix + ); + lastState = 0; + break; + } + blst_radix_print( + &scan[i], + blk, + radix, + next_skip - 1, + tab + ); + blk += radix; + } + tab -= 4; + + printf( + "%*.*s}\n", + tab, tab, "" + ); +} + +#endif + +#ifdef BLIST_DEBUG + +int +main(int ac, char **av) +{ + int size = 1024; + int i; + blist_t bl; + + for (i = 1; i < ac; ++i) { + const char *ptr = av[i]; + if (*ptr != '-') { + size = strtol(ptr, NULL, 0); + continue; + } + ptr += 2; + fprintf(stderr, "Bad option: %s\n", ptr - 2); + exit(1); + } + bl = blist_create(size); + blist_free(bl, 0, size); + + for (;;) { + char buf[1024]; + daddr_t da = 0; + daddr_t count = 0; + + + printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix); + fflush(stdout); + if (fgets(buf, sizeof(buf), stdin) == NULL) + break; + switch(buf[0]) { + case 'r': + if (sscanf(buf + 1, "%d", &count) == 1) { + blist_resize(&bl, count, 1); + } else { + printf("?\n"); + } + case 'p': + blist_print(bl); + break; + case 'a': + if (sscanf(buf + 1, "%d", &count) == 1) { + daddr_t blk = blist_alloc(bl, count); + printf(" R=%04x\n", blk); + } else { + printf("?\n"); + } + break; + case 'f': + if (sscanf(buf + 1, "%x %d", &da, &count) == 2) { + blist_free(bl, da, count); + } else { + printf("?\n"); + } + break; + case '?': + case 'h': + puts( + "p -print\n" + "a %d -allocate\n" + "f %x %d -free\n" + "r %d -resize\n" + "h/? -help" + ); + break; + default: + printf("?\n"); + break; + } + } + return(0); +} + +void +panic(const char *ctl, ...) +{ + va_list va; + + va_start(va, ctl); + vfprintf(stderr, ctl, va); + fprintf(stderr, "\n"); + va_end(va); + exit(1); +} + +#endif + diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c new file mode 100644 index 0000000..dc4c88a --- /dev/null +++ b/sys/kern/subr_bus.c @@ -0,0 +1,1572 @@ +/*- + * Copyright (c) 1997,1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_bus.c,v 1.13 1999/01/10 22:04:05 n_hibma Exp $ + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus_private.h> +#include <sys/systm.h> +#include <machine/stdarg.h> /* for device_printf() */ + +#include "opt_bus.h" + +#ifdef BUS_DEBUG +#define PDEBUG(a) (printf(__FUNCTION__ ":%d: ", __LINE__), printf a, printf("\n")) +#define DEVICENAME(d) ((d)? device_get_name(d): "no device") +#define DRIVERNAME(d) ((d)? d->name : "no driver") +#define DEVCLANAME(d) ((d)? d->name : "no devclass") + +/* Produce the indenting, indent*2 spaces plus a '.' ahead of that to + * prevent syslog from deleting initial spaces + */ +#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while(0) + +static void print_method_list(device_method_t *m, int indent); +static void print_device_ops(device_ops_t ops, int indent); +static void print_device_short(device_t dev, int indent); +static void print_device(device_t dev, int indent); +void print_device_tree_short(device_t dev, int indent); +void print_device_tree(device_t dev, int indent); +static void print_driver_short(driver_t *driver, int indent); +static void print_driver(driver_t *driver, int indent); +static void print_driver_list(driver_list_t drivers, int indent); +static void print_devclass_short(devclass_t dc, int indent); +static void print_devclass(devclass_t dc, int indent); +void print_devclass_list_short(void); +void print_devclass_list(void); + +#else +/* Make the compiler ignore the function calls */ +#define PDEBUG(a) /* nop */ +#define DEVICENAME(d) /* nop */ +#define DRIVERNAME(d) /* nop */ +#define DEVCLANAME(d) /* nop */ + +#define print_method_list(m,i) /* nop */ +#define print_device_ops(o,i) /* nop */ +#define print_device_short(d,i) /* nop */ +#define print_device(d,i) /* nop */ +#define print_device_tree_short(d,i) /* nop */ +#define print_device_tree(d,i) /* nop */ +#define print_driver_short(d,i) /* nop */ +#define print_driver(d,i) /* nop */ +#define print_driver_list(d,i) /* nop */ +#define print_devclass_short(d,i) /* nop */ +#define print_devclass(d,i) /* nop */ +#define print_devclass_list_short() /* nop */ +#define print_devclass_list() /* nop */ +#endif + + +/* + * Method table handling + */ +static int next_method_offset = 1; +static int methods_count = 0; +static int methods_size = 0; + +struct method { + int offset; + char* name; +}; + +static struct method *methods = 0; + +static void 
+register_method(struct device_op_desc *desc) +{ + int i; + struct method* m; + + for (i = 0; i < methods_count; i++) + if (!strcmp(methods[i].name, desc->name)) { + desc->offset = methods[i].offset; + PDEBUG(("methods[%d] has the same name, %s, with offset %d", + i, desc->name, desc->offset)); + return; + } + + if (methods_count == methods_size) { + struct method* p; + + methods_size += 10; + p = (struct method*) malloc(methods_size * sizeof(struct method), + M_DEVBUF, M_NOWAIT); + if (!p) + panic("register_method: out of memory"); + if (methods) { + bcopy(methods, p, methods_count * sizeof(struct method)); + free(methods, M_DEVBUF); + } + methods = p; + } + m = &methods[methods_count++]; + m->name = malloc(strlen(desc->name) + 1, M_DEVBUF, M_NOWAIT); + if (!m->name) + panic("register_method: out of memory"); + strcpy(m->name, desc->name); + desc->offset = m->offset = next_method_offset++; +} + +static int error_method(void) +{ + return ENXIO; +} + +static struct device_ops null_ops = { + 1, + { error_method } +}; + +static void +compile_methods(driver_t *driver) +{ + device_ops_t ops; + struct device_method *m; + int i; + + /* + * First register any methods which need it. + */ + for (i = 0, m = driver->methods; m->desc; i++, m++) + if (!m->desc->offset) + register_method(m->desc); + else + PDEBUG(("offset not equal to zero, method desc %d left as is", i)); + + /* + * Then allocate the compiled op table. + */ + ops = malloc(sizeof(struct device_ops) + (next_method_offset-1) * sizeof(devop_t), + M_DEVBUF, M_NOWAIT); + if (!ops) + panic("compile_methods: out of memory"); + + ops->maxoffset = next_method_offset; + for (i = 0; i < next_method_offset; i++) + ops->methods[i] = error_method; + for (i = 0, m = driver->methods; m->desc; i++, m++) + ops->methods[m->desc->offset] = m->func; + PDEBUG(("%s has %d method%s, wasting %d bytes", + DRIVERNAME(driver), i, (i==1?"":"s"), + (next_method_offset-i)*sizeof(devop_t))); + + driver->ops = ops; +} + +/* + * Devclass implementation + */ + +static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); + +static devclass_t +devclass_find_internal(const char *classname, int create) +{ + devclass_t dc; + + PDEBUG(("looking for %s", classname)); + if (!classname) + return NULL; + + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + if (!strcmp(dc->name, classname)) + return dc; + + PDEBUG(("%s not found%s", classname, (create? ", creating": ""))); + if (create) { + dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, + M_DEVBUF, M_NOWAIT); + if (!dc) + return NULL; + dc->name = (char*) (dc + 1); + strcpy(dc->name, classname); + dc->devices = NULL; + dc->maxunit = 0; + dc->nextunit = 0; + TAILQ_INIT(&dc->drivers); + TAILQ_INSERT_TAIL(&devclasses, dc, link); + } + + return dc; +} + +devclass_t +devclass_find(const char *classname) +{ + return devclass_find_internal(classname, FALSE); +} + +int +devclass_add_driver(devclass_t dc, driver_t *driver) +{ + PDEBUG(("%s", DRIVERNAME(driver))); + /* + * Compile the drivers methods. + */ + compile_methods(driver); + + /* + * Make sure the devclass which the driver is implementing exists. 
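The method-table machinery above (register_method()/compile_methods()) can be easier to follow in a small, self-contained user-level model: sparse (descriptor, function) pairs are assigned global offsets and compiled into a dense table whose empty slots fall back to an error method. Everything below is an illustrative sketch written for this note, not code from the commit.

#include <stdio.h>

typedef int (*devop_t)(void);

struct methdesc  { const char *name; int offset; };	/* cf. struct device_op_desc */
struct methentry { struct methdesc *desc; devop_t fn; };	/* cf. struct device_method */

#define MAXOPS 8
static int next_offset = 1;	/* real methods are numbered from 1, as in the kernel code */

static int error_method(void) { return -1; }	/* default for unimplemented methods */
static int sample_probe(void)  { return 0; }
static int sample_attach(void) { return 0; }

/* Assign a stable offset to each method descriptor (cf. register_method()). */
static void
register_offset(struct methdesc *d)
{
	if (d->offset == 0)
		d->offset = next_offset++;
}

/* Compile a driver's sparse method list into a dense dispatch table. */
static void
compile(struct methentry *list, devop_t table[MAXOPS])
{
	int i;

	for (i = 0; i < MAXOPS; i++)
		table[i] = error_method;
	for (; list->desc != NULL; list++) {
		register_offset(list->desc);
		table[list->desc->offset] = list->fn;
	}
}

int
main(void)
{
	static struct methdesc probe_desc  = { "device_probe", 0 };
	static struct methdesc attach_desc = { "device_attach", 0 };
	struct methentry methods[] = {
		{ &probe_desc, sample_probe },
		{ &attach_desc, sample_attach },
		{ NULL, NULL }
	};
	devop_t table[MAXOPS];

	compile(methods, table);
	/* Dispatch through the table, as the DEVICE_*() macros do in the kernel. */
	printf("probe -> %d, unimplemented -> %d\n",
	    table[probe_desc.offset](), table[MAXOPS - 1]());
	return 0;
}

(The real code also deduplicates method names across drivers and grows its tables dynamically; the model keeps only the offset/dispatch idea.)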
+ */ + devclass_find_internal(driver->name, TRUE); + + TAILQ_INSERT_TAIL(&dc->drivers, driver, link); + + return 0; +} + +int +devclass_delete_driver(devclass_t busclass, driver_t *driver) +{ + devclass_t dc = devclass_find(driver->name); + device_t dev; + int i; + int error; + + PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); + + if (!dc) + return 0; + + /* + * Disassociate from any devices. We iterate through all the + * devices in the devclass of the driver and detach any which are + * using the driver. + */ + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + dev = dc->devices[i]; + if (dev->driver == driver) { + if (error = device_detach(dev)) + return error; + device_set_driver(dev, NULL); + } + } + } + + TAILQ_REMOVE(&busclass->drivers, driver, link); + return 0; +} + +driver_t * +devclass_find_driver(devclass_t dc, const char *classname) +{ + driver_t *driver; + + PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); + + for (driver = TAILQ_FIRST(&dc->drivers); driver; + driver = TAILQ_NEXT(driver, link)) { + if (!strcmp(driver->name, classname)) + return driver; + } + + PDEBUG(("not found")); + return NULL; +} + +const char * +devclass_get_name(devclass_t dc) +{ + return dc->name; +} + +device_t +devclass_get_device(devclass_t dc, int unit) +{ + if (unit < 0 || unit >= dc->maxunit) + return NULL; + return dc->devices[unit]; +} + +void * +devclass_get_softc(devclass_t dc, int unit) +{ + device_t dev; + + if (unit < 0 || unit >= dc->maxunit) + return NULL; + dev = dc->devices[unit]; + if (!dev || dev->state < DS_ATTACHED) + return NULL; + return dev->softc; +} + +int +devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) +{ + int i; + int count; + device_t *list; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT); + if (!list) + return ENOMEM; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) { + list[count] = dc->devices[i]; + count++; + } + + *devlistp = list; + *devcountp = count; + + return 0; +} + +int +devclass_get_maxunit(devclass_t dc) +{ + return dc->maxunit; +} + +static int +devclass_alloc_unit(devclass_t dc, int *unitp) +{ + int unit = *unitp; + + PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); + + /* + * If we have been given a wired unit number, check for existing + * device. + */ + if (unit != -1) { + device_t dev; + dev = devclass_get_device(dc, unit); + if (dev) { + printf("devclass_alloc_unit: %s%d already exists, using next available unit number\n", dc->name, unit); + unit = -1; + } + } + + if (unit == -1) { + unit = dc->nextunit; + dc->nextunit++; + } else if (dc->nextunit <= unit) + dc->nextunit = unit + 1; + + if (unit >= dc->maxunit) { + device_t *newlist; + int newsize; + + newsize = (dc->maxunit ? 
2 * dc->maxunit + : MINALLOCSIZE / sizeof(device_t)); + newlist = malloc(sizeof(device_t) * newsize, M_DEVBUF, M_NOWAIT); + if (!newlist) + return ENOMEM; + bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit); + bzero(newlist + dc->maxunit, + sizeof(device_t) * (newsize - dc->maxunit)); + if (dc->devices) + free(dc->devices, M_DEVBUF); + dc->devices = newlist; + dc->maxunit = newsize; + } + PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); + + *unitp = unit; + return 0; +} + +static int +devclass_add_device(devclass_t dc, device_t dev) +{ + int error; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (error = devclass_alloc_unit(dc, &dev->unit)) + return error; + dc->devices[dev->unit] = dev; + dev->devclass = dc; + return 0; +} + +static int +devclass_delete_device(devclass_t dc, device_t dev) +{ + if (!dc || !dev) + return 0; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (dev->devclass != dc + || dc->devices[dev->unit] != dev) + panic("devclass_delete_device: inconsistent device class"); + dc->devices[dev->unit] = NULL; + if (dev->flags & DF_WILDCARD) + dev->unit = -1; + dev->devclass = NULL; + while (dc->nextunit > 0 && dc->devices[dc->nextunit - 1] == NULL) + dc->nextunit--; + return 0; +} + +static device_t +make_device(device_t parent, const char *name, + int unit, void *ivars) +{ + device_t dev; + devclass_t dc; + int error; + + PDEBUG(("%s at %s as unit %d with%s ivars", + name, DEVICENAME(parent), unit, (ivars? "":"out"))); + + if (name) { + dc = devclass_find_internal(name, TRUE); + if (!dc) { + printf("make_device: can't find device class %s\n", name); + return NULL; + } + + if (error = devclass_alloc_unit(dc, &unit)) + return NULL; + } else + dc = NULL; + + dev = malloc(sizeof(struct device), M_DEVBUF, M_NOWAIT); + if (!dev) + return 0; + + dev->parent = parent; + TAILQ_INIT(&dev->children); + dev->ops = &null_ops; + dev->driver = NULL; + dev->devclass = dc; + dev->unit = unit; + dev->desc = NULL; + dev->busy = 0; + dev->flags = DF_ENABLED; + if (unit == -1) + dev->flags |= DF_WILDCARD; + if (name) + dev->flags |= DF_FIXEDCLASS; + dev->ivars = ivars; + dev->softc = NULL; + + if (dc) + dc->devices[unit] = dev; + + dev->state = DS_NOTPRESENT; + + return dev; +} + +static void +device_print_child(device_t dev, device_t child) +{ + printf("%s%d", device_get_name(child), device_get_unit(child)); + if (device_is_alive(child)) { + if (device_get_desc(child)) + printf(": <%s>", device_get_desc(child)); + BUS_PRINT_CHILD(dev, child); + } else + printf(" not found"); + printf("\n"); +} + +device_t +device_add_child(device_t dev, const char *name, int unit, void *ivars) +{ + device_t child; + + PDEBUG(("%s at %s as unit %d with%s ivars", + name, DEVICENAME(dev), unit, (ivars? "":"out"))); + + child = make_device(dev, name, unit, ivars); + + if (child) + TAILQ_INSERT_TAIL(&dev->children, child, link); + else + PDEBUG(("%s failed", name)); + + return child; +} + +device_t +device_add_child_after(device_t dev, device_t place, const char *name, + int unit, void *ivars) +{ + device_t child; + + PDEBUG(("%s at %s after %s as unit %d with%s ivars", + name, DEVICENAME(dev), DEVICENAME(place), unit, (ivars? 
"":"out"))); + + child = make_device(dev, name, unit, ivars); + + if (place) { + TAILQ_INSERT_AFTER(&dev->children, place, dev, link); + } else { + TAILQ_INSERT_HEAD(&dev->children, dev, link); + } + + return child; +} + +int +device_delete_child(device_t dev, device_t child) +{ + int error; + device_t grandchild; + + PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); + + /* remove children first */ + while ( (grandchild = TAILQ_FIRST(&child->children)) ) { + error = device_delete_child(child, grandchild); + if (error) + return error; + } + + if (error = device_detach(child)) + return error; + if (child->devclass) + devclass_delete_device(child->devclass, child); + TAILQ_REMOVE(&dev->children, child, link); + free(child, M_DEVBUF); + + return 0; +} + +/* + * Find only devices attached to this bus. + */ +device_t +device_find_child(device_t dev, const char *classname, int unit) +{ + devclass_t dc; + device_t child; + + dc = devclass_find(classname); + if (!dc) + return NULL; + + child = devclass_get_device(dc, unit); + if (child && child->parent == dev) + return child; + return NULL; +} + +static driver_t * +first_matching_driver(devclass_t dc, device_t dev) +{ + if (dev->devclass) + return devclass_find_driver(dc, dev->devclass->name); + else + return TAILQ_FIRST(&dc->drivers); +} + +static driver_t * +next_matching_driver(devclass_t dc, device_t dev, driver_t *last) +{ + if (dev->devclass) { + driver_t *driver; + for (driver = TAILQ_NEXT(last, link); driver; + driver = TAILQ_NEXT(driver, link)) + if (!strcmp(dev->devclass->name, driver->name)) + return driver; + return NULL; + } else + return TAILQ_NEXT(last, link); +} + +static int +device_probe_child(device_t dev, device_t child) +{ + devclass_t dc; + driver_t *driver; + + dc = dev->devclass; + if (dc == NULL) + panic("device_probe_child: parent device has no devclass"); + + if (child->state == DS_ALIVE) + return 0; + + for (driver = first_matching_driver(dc, child); + driver; + driver = next_matching_driver(dc, child, driver)) { + PDEBUG(("Trying %s", DRIVERNAME(driver))); + device_set_driver(child, driver); + if (DEVICE_PROBE(child) == 0) { + if (!child->devclass) + device_set_devclass(child, driver->name); + child->state = DS_ALIVE; + return 0; + } + } + + return ENXIO; +} + +device_t +device_get_parent(device_t dev) +{ + return dev->parent; +} + +int +device_get_children(device_t dev, device_t **devlistp, int *devcountp) +{ + int count; + device_t child; + device_t *list; + + count = 0; + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT); + if (!list) + return ENOMEM; + + count = 0; + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) { + list[count] = child; + count++; + } + + *devlistp = list; + *devcountp = count; + + return 0; +} + +driver_t * +device_get_driver(device_t dev) +{ + return dev->driver; +} + +devclass_t +device_get_devclass(device_t dev) +{ + return dev->devclass; +} + +const char * +device_get_name(device_t dev) +{ + if (dev->devclass) + return devclass_get_name(dev->devclass); + return NULL; +} + +int +device_get_unit(device_t dev) +{ + return dev->unit; +} + +const char * +device_get_desc(device_t dev) +{ + return dev->desc; +} + +void +device_print_prettyname(device_t dev) +{ + const char *name = device_get_name(dev); + + if (name == 0) + name = "(no driver assigned)"; + printf("%s%d: ", name, device_get_unit(dev)); +} + +void +device_printf(device_t dev, const 
char * fmt, ...) +{ + va_list ap; + + device_print_prettyname(dev); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + +void +device_set_desc(device_t dev, const char* desc) +{ + dev->desc = desc; +} + +void * +device_get_softc(device_t dev) +{ + return dev->softc; +} + +void * +device_get_ivars(device_t dev) +{ + return dev->ivars; +} + +device_state_t +device_get_state(device_t dev) +{ + return dev->state; +} + +void +device_enable(device_t dev) +{ + dev->flags |= DF_ENABLED; +} + +void +device_disable(device_t dev) +{ + dev->flags &= ~DF_ENABLED; +} + +void +device_busy(device_t dev) +{ + if (dev->state < DS_ATTACHED) + panic("device_busy: called for unattached device"); + if (dev->busy == 0 && dev->parent) + device_busy(dev->parent); + dev->busy++; + dev->state = DS_BUSY; +} + +void +device_unbusy(device_t dev) +{ + if (dev->state != DS_BUSY) + panic("device_unbusy: called for non-busy device"); + dev->busy--; + if (dev->busy == 0) { + if (dev->parent) + device_unbusy(dev->parent); + dev->state = DS_ATTACHED; + } +} + +int +device_is_enabled(device_t dev) +{ + return (dev->flags & DF_ENABLED) != 0; +} + +int +device_is_alive(device_t dev) +{ + return dev->state >= DS_ALIVE; +} + +int +device_set_devclass(device_t dev, const char *classname) +{ + devclass_t dc; + + if (dev->devclass) { + printf("device_set_devclass: device class already set\n"); + return EINVAL; + } + + dc = devclass_find_internal(classname, TRUE); + if (!dc) + return ENOMEM; + + return devclass_add_device(dc, dev); +} + +int +device_set_driver(device_t dev, driver_t *driver) +{ + if (dev->state >= DS_ATTACHED) + return EBUSY; + + if (dev->driver == driver) + return 0; + + if (dev->softc) { + free(dev->softc, M_DEVBUF); + dev->softc = NULL; + } + dev->ops = &null_ops; + dev->driver = driver; + if (driver) { + dev->ops = driver->ops; + dev->softc = malloc(driver->softc, M_DEVBUF, M_NOWAIT); + if (!dev->softc) { + dev->ops = &null_ops; + dev->driver = NULL; + return ENOMEM; + } + bzero(dev->softc, driver->softc); + } + return 0; +} + +int +device_probe_and_attach(device_t dev) +{ + device_t bus = dev->parent; + int error = 0; + + if (dev->state >= DS_ALIVE) + return 0; + + if (dev->flags & DF_ENABLED) { + error = device_probe_child(bus, dev); + if (!error) { + device_print_child(bus, dev); + error = DEVICE_ATTACH(dev); + if (!error) + dev->state = DS_ATTACHED; + else { + printf("device_probe_and_attach: %s%d attach returned %d\n", + dev->driver->name, dev->unit, error); + device_set_driver(dev, NULL); + dev->state = DS_NOTPRESENT; + } + } + } else { + device_print_prettyname(dev); + printf("not probed (disabled)\n"); + } + + return error; +} + +int +device_detach(device_t dev) +{ + int error; + + PDEBUG(("%s", DEVICENAME(dev))); + if (dev->state == DS_BUSY) + return EBUSY; + if (dev->state != DS_ATTACHED) + return 0; + + if (error = DEVICE_DETACH(dev)) + return error; + + if (!(dev->flags & DF_FIXEDCLASS)) + devclass_delete_device(dev->devclass, dev); + + dev->state = DS_NOTPRESENT; + device_set_driver(dev, NULL); + + return 0; +} + +int +device_shutdown(device_t dev) +{ + if (dev->state < DS_ATTACHED) + return 0; + return DEVICE_SHUTDOWN(dev); +} + +/* + * Access functions for device resources. 
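For orientation, here is a hypothetical skeleton of the kind of driver that device_probe_and_attach() above ends up driving. The "foo" names, the parent bus name and the DEVMETHOD()/DRIVER_MODULE() glue assumed from <sys/bus.h> are placeholders for this sketch, not part of this commit, and it is not buildable on its own.

struct foo_softc {
	int	sc_dummy;		/* per-instance state, allocated by device_set_driver() */
};

static int
foo_probe(device_t dev)
{
	/* Claiming the device: returning 0 marks it DS_ALIVE. */
	device_set_desc(dev, "Foo widget");
	return 0;
}

static int
foo_attach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);

	sc->sc_dummy = 0;
	return 0;			/* non-zero would put the device back to DS_NOTPRESENT */
}

static device_method_t foo_methods[] = {
	DEVMETHOD(device_probe,		foo_probe),
	DEVMETHOD(device_attach,	foo_attach),
	{ 0, 0 }
};

static driver_t foo_driver = {
	"foo",				/* devclass name */
	foo_methods,
	DRIVER_TYPE_MISC,
	sizeof(struct foo_softc),	/* softc size */
};

static devclass_t foo_devclass;

DRIVER_MODULE(foo, isa, foo_driver, foo_devclass, 0, 0);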
+ */ +extern struct config_device devtab[]; +extern int devtab_count; + +static int +resource_match_string(int i, char *resname, char *value) +{ + int j; + struct config_resource *res; + + for (j = 0, res = devtab[i].resources; + j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname) + && res->type == RES_STRING + && !strcmp(res->u.stringval, value)) + return TRUE; + return FALSE; +} + +static int +resource_find(const char *name, int unit, char *resname, + struct config_resource **result) +{ + int i, j; + struct config_resource *res; + + /* + * First check specific instances, then generic. + */ + for (i = 0; i < devtab_count; i++) { + if (devtab[i].unit < 0) + continue; + if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) { + res = devtab[i].resources; + for (j = 0; j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname)) { + *result = res; + return 0; + } + } + } + for (i = 0; i < devtab_count; i++) { + if (devtab[i].unit >= 0) + continue; + if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) { + res = devtab[i].resources; + for (j = 0; j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname)) { + *result = res; + return 0; + } + } + } + return ENOENT; +} + +int +resource_int_value(const char *name, int unit, char *resname, int *result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_INT) + return EFTYPE; + *result = res->u.intval; + return 0; +} + +int +resource_long_value(const char *name, int unit, char *resname, long *result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_LONG) + return EFTYPE; + *result = res->u.longval; + return 0; +} + +int +resource_string_value(const char *name, int unit, char *resname, char **result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_STRING) + return EFTYPE; + *result = res->u.stringval; + return 0; +} + +int +resource_query_string(int i, char *resname, char *value) +{ + if (i < 0) + i = 0; + else + i = i + 1; + for (; i < devtab_count; i++) + if (resource_match_string(i, resname, value)) + return i; + return -1; +} + +char * +resource_query_name(int i) +{ + return devtab[i].name; +} + +int +resource_query_unit(int i) +{ + return devtab[i].unit; +} + + +/* + * Some useful method implementations to make life easier for bus drivers. 
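The resource_*_value() lookups above are how per-device configuration from the kernel config file (the generated devtab[]) reaches drivers. A hedged usage sketch follows; the device name "foo", the hint names and the fallback values are placeholders invented for this note.

static void
foo_read_config(void)
{
	int port, irq;

	/* Fall back to built-in defaults when the config file says nothing. */
	if (resource_int_value("foo", 0, "port", &port) != 0)
		port = 0x300;
	if (resource_int_value("foo", 0, "irq", &irq) != 0)
		irq = 10;
	printf("foo0: configured at port 0x%x irq %d\n", port, irq);
}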
+ */ +int +bus_generic_attach(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + device_probe_and_attach(child); + + return 0; +} + +int +bus_generic_detach(device_t dev) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return EBUSY; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + if (error = device_detach(child)) + return error; + + return 0; +} + +int +bus_generic_shutdown(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + DEVICE_SHUTDOWN(child); + + return 0; +} + +int +bus_generic_suspend(device_t dev) +{ + int error; + device_t child, child2; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) { + error = DEVICE_SUSPEND(child); + if (error) { + for (child2 = TAILQ_FIRST(&dev->children); + child2 && child2 != child; + child2 = TAILQ_NEXT(child2, link)) + DEVICE_RESUME(child2); + return (error); + } + } + return 0; +} + +int +bus_generic_resume(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) { + DEVICE_RESUME(child); + /* if resume fails, there's nothing we can usefully do... */ + } + return 0; +} + +void +bus_generic_print_child(device_t dev, device_t child) +{ + printf(" on %s%d", device_get_name(dev), device_get_unit(dev)); +} + +int +bus_generic_read_ivar(device_t dev, device_t child, int index, + uintptr_t * result) +{ + return ENOENT; +} + +int +bus_generic_write_ivar(device_t dev, device_t child, int index, + uintptr_t value) +{ + return ENOENT; +} + +int +bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, + driver_intr_t *intr, void *arg, void **cookiep) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_SETUP_INTR(dev->parent, child, irq, intr, arg, + cookiep)); + else + return (EINVAL); +} + +int +bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, + void *cookie) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); + else + return (EINVAL); +} + +struct resource * +bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, + start, end, count, flags)); + else + return (NULL); +} + +int +bus_generic_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +int +bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +int +bus_generic_deactivate_resource(device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. 
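The bus_generic_*() resource methods above simply delegate toward the root bus; a child driver normally reaches them through the convenience wrappers defined just below. A hedged sketch of such use: SYS_RES_IOPORT and RF_ACTIVE are the usual constants assumed from <machine/resource.h> and <sys/rman.h>, and the "bar" driver is a placeholder.

static int
bar_attach(device_t dev)
{
	struct resource *port;
	int rid = 0;

	/* Any 8 consecutive I/O ports; 0UL/~0UL means "no range restriction". */
	port = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid,
	    0UL, ~0UL, 8, RF_ACTIVE);
	if (port == NULL)
		return ENXIO;

	/* ... program the hardware through the allocated range ... */

	/* Released again right away only to keep the sketch short. */
	bus_release_resource(dev, SYS_RES_IOPORT, rid, port);
	return 0;
}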
*/ + if (dev->parent) + return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +/* + * Some convenience functions to make it easier for drivers to use the + * resource-management functions. All these really do is hide the + * indirection through the parent's method table, making for slightly + * less-wordy code. In the future, it might make sense for this code + * to maintain some sort of a list of resources allocated by each device. + */ +struct resource * +bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, + u_long count, u_int flags) +{ + if (dev->parent == 0) + return (0); + return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, + count, flags)); +} + +int +bus_activate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_release_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_RELEASE_RESOURCE(dev->parent, dev, + type, rid, r)); +} + +static void +root_print_child(device_t dev, device_t child) +{ +} + +static int +root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg, + void **cookiep) +{ + /* + * If an interrupt mapping gets to here something bad has happened. + */ + panic("root_setup_intr"); +} + +static device_method_t root_methods[] = { + /* Device interface */ + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_print_child, root_print_child), + DEVMETHOD(bus_read_ivar, bus_generic_read_ivar), + DEVMETHOD(bus_write_ivar, bus_generic_write_ivar), + DEVMETHOD(bus_setup_intr, root_setup_intr), + + { 0, 0 } +}; + +static driver_t root_driver = { + "root", + root_methods, + DRIVER_TYPE_MISC, + 1, /* no softc */ +}; + +device_t root_bus; +devclass_t root_devclass; + +static int +root_bus_module_handler(module_t mod, int what, void* arg) +{ + switch (what) { + case MOD_LOAD: + compile_methods(&root_driver); + root_bus = make_device(NULL, "root", 0, NULL); + root_bus->desc = "System root bus"; + root_bus->ops = root_driver.ops; + root_bus->driver = &root_driver; + root_bus->state = DS_ATTACHED; + root_devclass = devclass_find_internal("root", FALSE); + return 0; + } + + return 0; +} + +static moduledata_t root_bus_mod = { + "rootbus", + root_bus_module_handler, + 0 +}; +DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); + +void +root_bus_configure(void) +{ + device_t dev; + + PDEBUG((".")); + + for (dev = TAILQ_FIRST(&root_bus->children); dev; + dev = TAILQ_NEXT(dev, link)) { + device_probe_and_attach(dev); + } +} + +int +driver_module_handler(module_t mod, int what, void *arg) +{ + int error, i; + struct driver_module_data *dmd; + devclass_t bus_devclass; + + dmd = (struct driver_module_data *)arg; + bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE); + error = 0; + + switch (what) { + case MOD_LOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Loading module: driver %s on bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_add_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + if (error) + break; + + /* + * 
The drivers loaded in this way are assumed to all + * implement the same devclass. + */ + *dmd->dmd_devclass = + devclass_find_internal(dmd->dmd_drivers[0]->name, + TRUE); + break; + + case MOD_UNLOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Unloading module: driver %s from bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_delete_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + break; + } + + if (!error && dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod, what, dmd->dmd_chainarg); + return (error); +} + +#ifdef BUS_DEBUG + +/* the _short versions avoid iteration by not calling anything that prints + * more than oneliners. I love oneliners. + */ + +static void +print_method_list(device_method_t *m, int indent) +{ + int i; + + if (!m) + return; + + for (i = 0; m->desc; i++, m++) + indentprintf(("method %d: %s, offset=%d\n", + i, m->desc->name, m->desc->offset)); +} + +static void +print_device_ops(device_ops_t ops, int indent) +{ + int i; + int count = 0; + + if (!ops) + return; + + /* we present a list of the methods that are pointing to the + * error_method, but ignore the 0'th elements; it is always + * error_method. + */ + for (i = 1; i < ops->maxoffset; i++) { + if (ops->methods[i] == error_method) { + if (count == 0) + indentprintf(("error_method:")); + printf(" %d", i); + count++; + } + } + if (count) + printf("\n"); + + indentprintf(("(%d method%s, %d valid, %d error_method%s)\n", + ops->maxoffset-1, (ops->maxoffset-1 == 1? "":"s"), + ops->maxoffset-1-count, + count, (count == 1? "":"'s"))); +} + +static void +print_device_short(device_t dev, int indent) +{ + if (!dev) + return; + + indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%sivars,%ssoftc,busy=%d\n", + dev->unit, dev->desc, + (dev->parent? "":"no "), + (TAILQ_EMPTY(&dev->children)? "no ":""), + (dev->flags&DF_ENABLED? "enabled,":"disabled,"), + (dev->flags&DF_FIXEDCLASS? "fixed,":""), + (dev->flags&DF_WILDCARD? "wildcard,":""), + (dev->ivars? "":"no "), + (dev->softc? "":"no "), + dev->busy)); +} + +static void +print_device(device_t dev, int indent) +{ + if (!dev) + return; + + print_device_short(dev, indent); + + indentprintf(("Parent:\n")); + print_device_short(dev->parent, indent+1); + indentprintf(("Methods:\n")); + print_device_ops(dev->ops, indent+1); + indentprintf(("Driver:\n")); + print_driver_short(dev->driver, indent+1); + indentprintf(("Devclass:\n")); + print_devclass_short(dev->devclass, indent+1); +} + +void +print_device_tree_short(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device_short(dev, indent); + + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + print_device_tree_short(child, indent+1); +} + +void +print_device_tree(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device(dev, indent); + + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + print_device_tree(child, indent+1); +} + +static void +print_driver_short(driver_t *driver, int indent) +{ + if (!driver) + return; + + indentprintf(("driver %s: type = %s%s%s%s, softc size = %d\n", + driver->name, + /* yes, I know this looks silly, but going to bed at + * two o'clock and having to get up at 7:30 again is silly + * as well. As is sticking your head in a bucket of water. + */ + (driver->type == DRIVER_TYPE_TTY? 
"tty":""), + (driver->type == DRIVER_TYPE_BIO? "bio":""), + (driver->type == DRIVER_TYPE_NET? "net":""), + (driver->type == DRIVER_TYPE_MISC? "misc":""), + driver->softc)); +} + +static void +print_driver(driver_t *driver, int indent) +{ + if (!driver) + return; + + print_driver_short(driver, indent); + indentprintf(("Methods:\n")); + print_method_list(driver->methods, indent+1); + indentprintf(("Operations:\n")); + print_device_ops(driver->ops, indent+1); +} + + +static void +print_driver_list(driver_list_t drivers, int indent) +{ + driver_t *driver; + + for (driver = TAILQ_FIRST(&drivers); driver; + driver = TAILQ_NEXT(driver, link)) + print_driver(driver, indent); +} + +static void +print_devclass_short(devclass_t dc, int indent) +{ + if ( !dc ) + return; + + indentprintf(("devclass %s: max units = %d, next unit = %d\n", + dc->name, dc->maxunit, dc->nextunit)); +} + +static void +print_devclass(devclass_t dc, int indent) +{ + int i; + + if ( !dc ) + return; + + print_devclass_short(dc, indent); + indentprintf(("Drivers:\n")); + print_driver_list(dc->drivers, indent+1); + + indentprintf(("Devices:\n")); + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + print_device(dc->devices[i], indent+1); +} + +void +print_devclass_list_short(void) +{ + devclass_t dc; + + printf("Short listing of devclasses, drivers & devices:\n"); + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + print_devclass_short(dc, 0); +} + +void +print_devclass_list(void) +{ + devclass_t dc; + + printf("Full listing of devclasses, drivers & devices:\n"); + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + print_devclass(dc, 0); +} + +#endif diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..593d00c --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. 
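A hedged sketch of how the reservation interface is meant to be used from a tty-style driver: reserve cblocks for each queue at first open and release them (clist_free_cblocks(), a little further down) on last close, once the queues have been drained. The queue sizes and "foo" names are purely illustrative.

static void
foo_tty_alloc_queues(struct tty *tp)
{
	int s = spltty();

	clist_alloc_cblocks(&tp->t_rawq, 1024, 512);	/* ccmax, ccreserved */
	clist_alloc_cblocks(&tp->t_canq, 1024, 512);
	clist_alloc_cblocks(&tp->t_outq, 1024, 512);
	splx(s);
}

static void
foo_tty_free_queues(struct tty *tp)
{
	int s = spltty();

	/* The queues must be empty here, or clist_free_cblocks() panics. */
	clist_free_cblocks(&tp->t_rawq);
	clist_free_cblocks(&tp->t_canq);
	clist_free_cblocks(&tp->t_outq);
	splx(s);
}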
+ */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
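A hedged sketch of a typical q_to_b() consumer: a transmit-start routine that pulls up to one FIFO's worth of output into a linear bounce buffer for the hardware. The FIFO size and the "foo" names are placeholders; q_to_b() raises spltty() internally.

#define FOO_FIFO_SIZE	16

static void
foo_start_tx(struct tty *tp)
{
	char buf[FOO_FIFO_SIZE];
	int n;

	n = q_to_b(&tp->t_outq, buf, sizeof(buf));
	if (n > 0) {
		/* ... hand buf[0..n-1] to the device's transmit FIFO ... */
	}
}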
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. 
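 * (The invariant in this loop is that c_cl points at the next free byte;
 * cblocks are allocated so that they sit on boundaries equal to their own
 * size, which is what makes the CROUND masking valid, so once c_cl reaches
 * a boundary -- (intptr_t)c_cl & CROUND == 0 -- the current cblock is full.
 * A fresh cblock is then taken from the free list, subject to the
 * reserved/slush limits, and chained onto the previous one via prev->c_next.)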
+ */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
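The quote-bit clearing in b_to_q() above packs some fiddly mask arithmetic; the following self-contained user-level model (written for this note, not kernel code) reproduces it and prints a worked example.

#include <stdio.h>
#include <string.h>

#define NBBY 8

/*
 * Clear bits startbit..endbit (inclusive) in a bit array, using the same
 * first/last-byte mask arithmetic as the quote-bit clearing above.
 */
static void
clrbits(unsigned char *map, int startbit, int endbit)
{
	unsigned char startmask, endmask;
	unsigned char *firstbyte = map + startbit / NBBY;
	unsigned char *lastbyte = map + endbit / NBBY;

	startmask = 0xff >> (NBBY - (startbit % NBBY));	/* keep bits below startbit */
	endmask = 0xff << ((endbit % NBBY) + 1);	/* keep bits above endbit */

	if (firstbyte != lastbyte) {
		*firstbyte &= startmask;
		*lastbyte &= endmask;
		if (lastbyte - firstbyte > 1)
			memset(firstbyte + 1, 0, lastbyte - firstbyte - 1);
	} else
		*firstbyte &= (startmask | endmask);
}

int
main(void)
{
	unsigned char map[4];
	int i;

	memset(map, 0xff, sizeof(map));
	clrbits(map, 3, 21);	/* worked example: clear bits 3..21 */
	for (i = 0; i < 4; i++)
		printf("byte %d: 0x%02x\n", i, map[i]);
	/* expected: 0x07 0x00 0xc0 0xff */
	return 0;
}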
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c new file mode 100644 index 0000000..5fcf88e --- /dev/null +++ b/sys/kern/subr_devstat.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 1997, 1998 Kenneth D. Merry. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
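A hedged sketch of how a disk driver is expected to use the devstat interface declared below: register one zeroed devstat per unit at attach time and bracket each transfer with start/end calls. DEVSTAT_NO_ORDERED_TAGS and DEVSTAT_TYPE_DIRECT are assumed here to come from <sys/devicestat.h>; the "foo" driver and the 512-byte block size are placeholders.

static struct devstat foo_stats;	/* static, hence zeroed, as required */

static void
foo_disk_attach(int unit)
{
	devstat_add_entry(&foo_stats, "foo", unit, 512,
	    DEVSTAT_NO_ORDERED_TAGS, DEVSTAT_TYPE_DIRECT);
}

static void
foo_disk_strategy(struct buf *bp)
{
	devstat_start_transaction(&foo_stats);
	/* ... queue bp to the hardware; foo_disk_done() runs on completion ... */
}

static void
foo_disk_done(struct buf *bp)
{
	devstat_end_transaction(&foo_stats, bp->b_bcount - bp->b_resid,
	    DEVSTAT_TAG_NONE,
	    (bp->b_flags & B_READ) ? DEVSTAT_READ : DEVSTAT_WRITE);
}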
+ * + * $Id: subr_devstat.c,v 1.7 1998/12/04 22:54:51 archie Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/sysctl.h> + +#include <sys/devicestat.h> + +static int devstat_num_devs; +static long devstat_generation; +static int devstat_version = DEVSTAT_VERSION; +static int devstat_current_devnumber; + +STAILQ_HEAD(devstatlist, devstat) device_statq; + +/* + * Take a malloced and zeroed devstat structure given to us, fill it in + * and add it to the queue of devices. + */ +void +devstat_add_entry(struct devstat *ds, const char *dev_name, + int unit_number, u_int32_t block_size, + devstat_support_flags flags, + devstat_type_flags device_type) +{ + int s; + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + if (devstat_num_devs == 0) + STAILQ_INIT(&device_statq); + + devstat_generation++; + devstat_num_devs++; + + devstat_head = &device_statq; + + STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); + + ds->device_number = devstat_current_devnumber++; + ds->unit_number = unit_number; + strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); + ds->device_name[DEVSTAT_NAME_LEN - 1] = 0; + ds->block_size = block_size; + ds->flags = flags; + ds->device_type = device_type; + + s = splclock(); + getmicrotime(&ds->dev_creation_time); + splx(s); +} + +/* + * Remove a devstat structure from the list of devices. + */ +void +devstat_remove_entry(struct devstat *ds) +{ + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + devstat_generation++; + devstat_num_devs--; + + devstat_head = &device_statq; + + /* Remove this entry from the devstat queue */ + STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); +} + +/* + * Record a transaction start. + */ +void +devstat_start_transaction(struct devstat *ds) +{ + int s; + + /* sanity check */ + if (ds == NULL) + return; + + /* + * We only want to set the start time when we are going from idle + * to busy. The start time is really the start of the latest busy + * period. + */ + if (ds->busy_count == 0) { + s = splclock(); + getmicrouptime(&ds->start_time); + splx(s); + } + ds->busy_count++; +} + +/* + * Record the ending of a transaction, and incrment the various counters. + */ +void +devstat_end_transaction(struct devstat *ds, u_int32_t bytes, + devstat_tag_type tag_type, devstat_trans_flags flags) +{ + int s; + struct timeval busy_time; + + /* sanity check */ + if (ds == NULL) + return; + + s = splclock(); + getmicrouptime(&ds->last_comp_time); + splx(s); + + ds->busy_count--; + + /* + * There might be some transactions (DEVSTAT_NO_DATA) that don't + * transfer any data. + */ + if (flags == DEVSTAT_READ) { + ds->bytes_read += bytes; + ds->num_reads++; + } else if (flags == DEVSTAT_WRITE) { + ds->bytes_written += bytes; + ds->num_writes++; + } else + ds->num_other++; + + /* + * Keep a count of the various tag types sent. + */ + if (tag_type != DEVSTAT_TAG_NONE) + ds->tag_types[tag_type]++; + + /* + * We only update the busy time when we go idle. Otherwise, this + * calculation would require many more clock cycles. + */ + if (ds->busy_count == 0) { + /* Calculate how long we were busy */ + busy_time = ds->last_comp_time; + timevalsub(&busy_time, &ds->start_time); + + /* Add our busy time to the total busy time. */ + timevaladd(&ds->busy_time, &busy_time); + } else if (ds->busy_count < 0) + printf("devstat_end_transaction: HELP!! 
busy_count " + "for %s%d is < 0 (%d)!\n", ds->device_name, + ds->unit_number, ds->busy_count); +} + +/* + * This is the sysctl handler for the devstat package. The data pushed out + * on the kern.devstat.all sysctl variable consists of the current devstat + * generation number, and then an array of devstat structures, one for each + * device in the system. + * + * I'm really not too fond of this method of doing things, but there really + * aren't that many alternatives. We must have some method of making sure + * that the generation number the user gets corresponds with the data the + * user gets. If the user makes a separate sysctl call to get the + * generation, and then a sysctl call to get the device statistics, the + * device list could have changed in that brief period of time. By + * supplying the generation number along with the statistics output, we can + * guarantee that the generation number and the statistics match up. + */ +static int +sysctl_devstat SYSCTL_HANDLER_ARGS +{ + int error, i; + struct devstat *nds; + struct devstatlist *devstat_head; + + if (devstat_num_devs == 0) + return(EINVAL); + + error = 0; + devstat_head = &device_statq; + + /* + * First push out the generation number. + */ + error = SYSCTL_OUT(req, &devstat_generation, sizeof(long)); + + /* + * Now push out all the devices. + */ + for (i = 0, nds = devstat_head->stqh_first; + (nds != NULL) && (i < devstat_num_devs) && (error == 0); + nds = nds->dev_links.stqe_next, i++) + error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); + + return(error); +} + +/* + * Sysctl entries for devstat. The first one is a node that all the rest + * hang off of. + */ +SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics"); + +SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, + 0, 0, sysctl_devstat, "S,devstat", "All Devices"); +/* + * Export the number of devices in the system so that userland utilities + * can determine how much memory to allocate to hold all the devices. + */ +SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, + 0, "Number of devices in the devstat list"); +SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, + &devstat_generation, "Devstat list generation"); +SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, + 0, "Devstat list version number"); diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..33f1d2a --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,410 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $Id: ufs_disksubr.c,v 1.38 1998/10/17 07:49:04 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> + +/* + * Seek sort for disks. + * + * The buf_queue keep two queues, sorted in ascending block order. The first + * queue holds those requests which are positioned after the current block + * (in the first request); the second, which starts at queue->switch_point, + * holds requests which came in after their block number was passed. Thus + * we implement a one way scan, retracting after reaching the end of the drive + * to the first request on the second queue, at which time it becomes the + * first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. + */ + +void +bufqdisksort(bufq, bp) + struct buf_queue_head *bufq; + struct buf *bp; +{ + struct buf *bq; + struct buf *bn; + struct buf *be; + + be = TAILQ_LAST(&bufq->queue, buf_queue); + /* + * If the queue is empty or we are an + * ordered transaction, then it's easy. + */ + if ((bq = bufq_first(bufq)) == NULL + || (bp->b_flags & B_ORDERED) != 0) { + bufq_insert_tail(bufq, bp); + return; + } else if (bufq->insert_point != NULL) { + + /* + * A certain portion of the list is + * "locked" to preserve ordering, so + * we can only insert after the insert + * point. + */ + bq = bufq->insert_point; + } else { + + /* + * If we lie before the last removed (currently active) + * request, and are not inserting ourselves into the + * "locked" portion of the list, then we must add ourselves + * to the second request list. + */ + if (bp->b_pblkno < bufq->last_pblkno) { + + bq = bufq->switch_point; + /* + * If we are starting a new secondary list, + * then it's easy. + */ + if (bq == NULL) { + bufq->switch_point = bp; + bufq_insert_tail(bufq, bp); + return; + } + /* + * If we lie ahead of the current switch point, + * insert us before the switch point and move + * the switch point. 
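 * (A worked example with made-up block numbers: with the head currently at
 * block 500, a first list holding 600 and 700 and a second list of 100 and
 * 200 after the switch point, a new request for block 50 goes onto the
 * second list ahead of 100 and becomes the new switch point, while a
 * request for 650 is sorted into the first list between 600 and 700.)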
+ */ + if (bp->b_pblkno < bq->b_pblkno) { + bufq->switch_point = bp; + TAILQ_INSERT_BEFORE(bq, bp, b_act); + return; + } + } else { + if (bufq->switch_point != NULL) + be = TAILQ_PREV(bufq->switch_point, + buf_queue, b_act); + /* + * If we lie between last_pblkno and bq, + * insert before bq. + */ + if (bp->b_pblkno < bq->b_pblkno) { + TAILQ_INSERT_BEFORE(bq, bp, b_act); + return; + } + } + } + + /* + * Request is at/after our current position in the list. + * Optimize for sequential I/O by seeing if we go at the tail. + */ + if (bp->b_pblkno > be->b_pblkno) { + TAILQ_INSERT_AFTER(&bufq->queue, be, bp, b_act); + return; + } + + /* Otherwise, insertion sort */ + while ((bn = TAILQ_NEXT(bq, b_act)) != NULL) { + + /* + * We want to go after the current request if it is the end + * of the first request list, or if the next request is a + * larger cylinder than our request. + */ + if (bn == bufq->switch_point + || bp->b_pblkno < bn->b_pblkno) + break; + bq = bn; + } + TAILQ_INSERT_AFTER(&bufq->queue, bq, bp, b_act); +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be filled in before calling us. + * Returns NULL on success and an error string on failure. + */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register int i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... + */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. 
XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int error = 0; + + if (lp->d_partitions[RAW_PART].p_offset != 0) + return (EXDEV); /* not quite right */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, RAW_PART); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) + */ + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + error = biowait(bp); + if (error) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~(B_DONE | B_READ); + bp->b_flags |= B_BUSY | B_WRITE; +#ifdef __alpha__ + alpha_fix_srm_checksum(bp); +#endif + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +u_int +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. 
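+ * For instance, a driver that hit an unrecoverable read error and does not
+ * know where in the transfer it occurred might call
+ *	diskerr(bp, "wd", "hard error", LOG_PRINTF, -1, lp);
+ *	printf("\n");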
+ */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev); + int slice = dkslice(bp->b_dev); + int part = dkpart(bp->b_dev); + register int (*pr) __P((const char *, ...)); + char partname[2]; + char *sname; + daddr_t sn; + + if (pri != LOG_PRINTF) { + log(pri, "%s", ""); + pr = addlog; + } else + pr = printf; + sname = dsname(dname, unit, slice, part, partname); + (*pr)("%s%s: %s %sing fsbn ", sname, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%ld", (long)sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%ld of ", (long)sn); + } + (*pr)("%ld-%ld", (long)bp->b_blkno, + (long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE)); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + (*pr)(" (%s bn %ld; cn %ld", sname, (long)sn, + (long)(sn / lp->d_secpercyl)); + sn %= (long)lp->d_secpercyl; + (*pr)(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors), + (long)(sn % lp->d_nsectors)); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..adfd39c --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: diskslice_machdep.c,v 1.31 1998/08/10 07:22:14 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_EXTENDEDX 15 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; + +static int check_part __P((char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset)); +static void extended __P((char *dname, dev_t dev, d_strategy_t *strat, + struct disklabel *lp, struct diskslices *ssp, + u_long ext_offset, u_long ext_size, + u_long base_ext_offset, int nsectors, int ntracks, + u_long mbr_offset)); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. Always allow + * 1023/255/63. + */ + if (ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (secpercyl != 0 + && (ssector1 - ssector) % (1024 * secpercyl) == 0)) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* Allow certain bogus C/H/S values for esector, as above. 
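+ * (For example, with 63 sectors per track and 16 heads, C/H/S 2/1/1 maps to
+ * absolute sector 2 * 1008 + 1 * 63 + 0 = 2079 when mbr_offset is 0.)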
*/ + if (esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (secpercyl != 0 + && (esector1 - esector) % (1024 * secpercyl) == 0)) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, + (u_long)dp->dp_size, error ? "" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dname, dev, strat, lp, sspp) + char *dname; + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading primary partition table", + LOG_PRINTF, 0, (struct disklabel *)NULL); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. */ + cp = bp->b_data; + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + if (bootverbose) + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + dp0 = (struct dos_partition *)(cp + DOSPARTOFF); + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0) { + TRACE(("%s: invalid primary partition table: historical\n", + sname)); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. 
+ * Check against d_secperunit if the latter is reliable. + */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + u_long secperunit; + + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + secperunit = secpercyl * max_ncyls; + if (lp->d_secperunit < secperunit) + lp->d_secperunit = secperunit; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * We are passed a pointer to a suitably initialized minimal + * slices "struct" with no dangling pointers in it. Replace it + * by a maximal one. This usually oversizes the "struct", but + * enlarging it while searching for logical drives would be + * inconvenient. + */ + free(*sspp, M_DEVBUF); + ssp = dsmakeslicestruct(MAX_SLICES, lp); + *sspp = ssp; + + /* Initialize normal slices. */ + sp = &ssp->dss_slices[BASE_SLICE]; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sp->ds_offset = mbr_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart + | DSTYPE_INDOSPART; +#endif + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED || + sp->ds_type == DOSPTYP_EXTENDEDX) + extended(dname, bp->b_dev, strat, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset, + nsectors, ntracks, mbr_offset) + char *dname; + dev_t dev; + struct disklabel *lp; + d_strategy_t *strat; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading extended partition table", + LOG_PRINTF, 0, (struct disklabel *)NULL); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
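+ * (as for the primary table, only the 0x55 0xaa signature in the last two
+ * bytes of the sector is checked)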
*/ + cp = bp->b_data; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (bootverbose) + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + for (dospart = 0, + dp = (struct dos_partition *)(bp->b_data + DOSPARTOFF), + slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice]; + dospart < NDOSPART; dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED || + dp->dp_typ == DOSPTYP_EXTENDEDX) { + char buf[32]; + + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + snprintf(buf, sizeof(buf), "%s", sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dname, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp->ds_offset = ext_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; + ssp->dss_nslices++; + slice++; + sp++; + } + } + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + extended(dname, dev, strat, lp, ssp, + ext_offsets[dospart], ext_sizes[dospart], + base_ext_offset, nsectors, ntracks, + mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..fa0e4a4 --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,1192 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_diskslice.c,v 1.60 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_devfs.h" + +#include <stddef.h> + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/dkbad.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <ufs/ffs/fs.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static struct disklabel *clone_label __P((struct disklabel *lp)); +static void dsiodone __P((struct buf *bp)); +static char *fixlabel __P((char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag)); +static void free_ds_label __P((struct diskslices *ssp, int slice)); +#ifdef DEVFS +static void free_ds_labeldevs __P((struct diskslices *ssp, int slice)); +#endif +static void partition_info __P((char *sname, int part, struct partition *pp)); +static void slice_info __P((char *sname, struct diskslice *sp)); +static void set_ds_bad __P((struct diskslices *ssp, int slice, + struct dkbad_intern *btp)); +static void set_ds_label __P((struct diskslices *ssp, int slice, + struct disklabel *lp)); +#ifdef DEVFS +static void set_ds_labeldevs __P((char *dname, dev_t dev, + struct diskslices *ssp)); +static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev, + struct diskslices *ssp)); +#endif +static void set_ds_wlabel __P((struct diskslices *ssp, int slice, + int wlabel)); + +/* + * Duplicate a label for the whole disk, and initialize defaults in the + * copy for fields that are not already initialized. The caller only + * needs to initialize d_secsize and d_secperunit, and zero the fields + * that are to be defaulted. 
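+ * A driver that knows nothing but the sector size and capacity can, for
+ * example, bzero() a label, fill in d_secsize and d_secperunit, and pass it
+ * here to get back a malloc()ed copy with usable defaults and a valid
+ * checksum.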
+ */ +static struct disklabel * +clone_label(lp) + struct disklabel *lp; +{ + struct disklabel *lp1; + + lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK); + *lp1 = *lp; + lp = NULL; + if (lp1->d_typename[0] == '\0') + strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename)); + if (lp1->d_packname[0] == '\0') + strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname)); + if (lp1->d_nsectors == 0) + lp1->d_nsectors = 32; + if (lp1->d_ntracks == 0) + lp1->d_ntracks = 64; + lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks; + lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl; + if (lp1->d_rpm == 0) + lp1->d_rpm = 3600; + if (lp1->d_interleave == 0) + lp1->d_interleave = 1; + if (lp1->d_npartitions < RAW_PART + 1) + lp1->d_npartitions = MAXPARTITIONS; + if (lp1->d_bbsize == 0) + lp1->d_bbsize = BBSIZE; + if (lp1->d_sbsize == 0) + lp1->d_sbsize = SBSIZE; + lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit; + lp1->d_magic = DISKMAGIC; + lp1->d_magic2 = DISKMAGIC; + lp1->d_checksum = dkcksum(lp1); + return (lp1); +} + +/* + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + * + * XXX TODO: + * o Do bad sector remapping. May need to split buffer. + * o Split buffers that are too big for the device. + * o Check for overflow. + * o Finish cleaning this up. + */ +int +dscheck(bp, ssp) + struct buf *bp; + struct diskslices *ssp; +{ + daddr_t blkno; + u_long endsecno; + daddr_t labelsect; + struct disklabel *lp; + char *msg; + long nsec; + struct partition *pp; + daddr_t secno; + daddr_t slicerel_secno; + struct diskslice *sp; + int s; + + blkno = bp->b_blkno; + if (blkno < 0) { + printf("dscheck: negative b_blkno %ld\n", (long)blkno); + bp->b_error = EINVAL; + goto bad; + } + sp = &ssp->dss_slices[dkslice(bp->b_dev)]; + lp = sp->ds_label; + if (ssp->dss_secmult == 1) { + if (bp->b_bcount % (u_long)DEV_BSIZE) + goto bad_bcount; + secno = blkno; + nsec = bp->b_bcount >> DEV_BSHIFT; + } else if (ssp->dss_secshift != -1) { + if (bp->b_bcount & (ssp->dss_secsize - 1)) + goto bad_bcount; + if (blkno & (ssp->dss_secmult - 1)) + goto bad_blkno; + secno = blkno >> ssp->dss_secshift; + nsec = bp->b_bcount >> (DEV_BSHIFT + ssp->dss_secshift); + } else { + if (bp->b_bcount % ssp->dss_secsize) + goto bad_bcount; + if (blkno % ssp->dss_secmult) + goto bad_blkno; + secno = blkno / ssp->dss_secmult; + nsec = bp->b_bcount / ssp->dss_secsize; + } + if (lp == NULL) { + labelsect = -LABELSECTOR - 1; + endsecno = sp->ds_size; + slicerel_secno = secno; + } else { + labelsect = lp->d_partitions[LABEL_PART].p_offset; +if (labelsect != 0) Debugger("labelsect != 0 in dscheck()"); + pp = &lp->d_partitions[dkpart(bp->b_dev)]; + endsecno = pp->p_size; + slicerel_secno = pp->p_offset + secno; + if (sp->ds_bad != NULL && ds_debug) { + daddr_t newsecno; + + newsecno = transbad144(sp->ds_bad, slicerel_secno); + if (newsecno != slicerel_secno) + printf("should map bad sector %ld -> %ld\n", + (long)slicerel_secno, (long)newsecno); + } + } + + /* overwriting disk label ? */ + /* XXX should also protect bootstrap in first 8K */ + if (slicerel_secno <= LABELSECTOR + labelsect && +#if LABELSECTOR != 0 + slicerel_secno + nsec > LABELSECTOR + labelsect && +#endif + (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } + +#if defined(DOSBBSECTOR) && defined(notyet) + /* overwriting master boot record? 
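+ * (slice-relative sector DOSBBSECTOR; like the label, it would only be
+ * writable while ds_wlabel is set)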
*/ + if (slicerel_secno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && + sp->ds_wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } +#endif + + /* beyond partition? */ + if (secno + nsec > endsecno) { + /* if exactly at end of disk, return an EOF */ + if (secno == endsecno) { + bp->b_resid = bp->b_bcount; + return (0); + } + /* or truncate if part of it fits */ + nsec = endsecno - secno; + if (nsec <= 0) { + bp->b_error = EINVAL; + goto bad; + } + bp->b_bcount = nsec * ssp->dss_secsize; + } + + bp->b_pblkno = sp->ds_offset + slicerel_secno; + + /* + * Snoop on label accesses if the slice offset is nonzero. Fudge + * offsets in the label to keep the in-core label coherent with + * the on-disk one. + */ + if (slicerel_secno <= LABELSECTOR + labelsect +#if LABELSECTOR != 0 + && slicerel_secno + nsec > LABELSECTOR + labelsect +#endif + && sp->ds_offset != 0) { + struct iodone_chain *ic; + + ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK); + ic->ic_prev_flags = bp->b_flags; + ic->ic_prev_iodone = bp->b_iodone; + ic->ic_prev_iodone_chain = bp->b_iodone_chain; + ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - + slicerel_secno) * ssp->dss_secsize; + ic->ic_args[1].ia_ptr = sp; + bp->b_flags |= B_CALL; + bp->b_iodone = dsiodone; + bp->b_iodone_chain = ic; + if (!(bp->b_flags & B_READ)) { + /* + * XXX even disklabel(8) writes directly so we need + * to adjust writes. Perhaps we should drop support + * for DIOCWLABEL (always write protect labels) and + * require the use of DIOCWDINFO. + * + * XXX probably need to copy the data to avoid even + * temporarily corrupting the in-core copy. + */ + if (bp->b_vp != NULL) { + s = splbio(); + bp->b_vp->v_numoutput++; + splx(s); + } + /* XXX need name here. */ + msg = fixlabel((char *)NULL, sp, + (struct disklabel *) + (bp->b_data + ic->ic_args[0].ia_long), + TRUE); + if (msg != NULL) { + printf("%s\n", msg); + bp->b_error = EROFS; + goto bad; + } + } + } + return (1); + +bad_bcount: + printf("dscheck: b_bcount %ld is not on a sector boundary (ssize %d)\n", + bp->b_bcount, ssp->dss_secsize); + bp->b_error = EINVAL; + goto bad; + +bad_blkno: + printf("dscheck: b_blkno %ld is not on a sector boundary (ssize %d)\n", + (long)blkno, ssp->dss_secsize); + bp->b_error = EINVAL; + goto bad; + +bad: + bp->b_resid = bp->b_bcount; + bp->b_flags |= B_ERROR; + return (-1); +} + +void +dsclose(dev, mode, ssp) + dev_t dev; + int mode; + struct diskslices *ssp; +{ + u_char mask; + struct diskslice *sp; + + sp = &ssp->dss_slices[dkslice(dev)]; + mask = 1 << dkpart(dev); + switch (mode) { + case S_IFBLK: + sp->ds_bopenmask &= ~mask; + break; + case S_IFCHR: + sp->ds_copenmask &= ~mask; + break; + } + sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask; +} + +void +dsgone(sspp) + struct diskslices **sspp; +{ + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_bad != NULL) { + free(sp->ds_bad, M_DEVBUF); + set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL); + } +#ifdef DEVFS + if (sp->ds_bdev != NULL) + devfs_remove_dev(sp->ds_bdev); + if (sp->ds_cdev != NULL) + devfs_remove_dev(sp->ds_cdev); +#endif + free_ds_label(ssp, slice); + } + free(ssp, M_DEVBUF); + *sspp = NULL; +} + +/* + * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this + * is subject to the same restriction as dsopen(). 
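+ * DIOCWDINFO, for example, first installs the new label exactly as
+ * DIOCSDINFO does and then calls writedisklabel() with label write
+ * protection temporarily lifted.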
+ */ +int +dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom) + char *dname; + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct diskslices **sspp; + d_strategy_t *strat; + ds_setgeom_t *setgeom; +{ + int error; + struct disklabel *lp; + int old_wlabel; + u_char openmask; + int part; + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + slice = dkslice(dev); + ssp = *sspp; + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + switch (cmd) { + + case DIOCGDINFO: + if (lp == NULL) + return (EINVAL); + *(struct disklabel *)data = *lp; + return (0); + +#ifdef notyet + case DIOCGDINFOP: + if (lp == NULL) + return (EINVAL); + *(struct disklabel **)data = lp; + return (0); +#endif + + case DIOCGPART: + if (lp == NULL) + return (EINVAL); + ((struct partinfo *)data)->disklab = lp; + ((struct partinfo *)data)->part + = &lp->d_partitions[dkpart(dev)]; + return (0); + + case DIOCGSLICEINFO: + bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] - + (char *)ssp); + return (0); + + case DIOCSBAD: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + if (lp == NULL) + return (EINVAL); + if (sp->ds_bad != NULL) + free(sp->ds_bad, M_DEVBUF); + set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp)); + return (0); + + case DIOCSDINFO: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + if (sp->ds_label == NULL) + bzero(lp, sizeof *lp); + else + bcopy(sp->ds_label, lp, sizeof *lp); + if (sp->ds_label == NULL) + openmask = 0; + else { + openmask = sp->ds_openmask; + if (slice == COMPATIBILITY_SLICE) + openmask |= ssp->dss_slices[ + ssp->dss_first_bsd_slice].ds_openmask; + else if (slice == ssp->dss_first_bsd_slice) + openmask |= ssp->dss_slices[ + COMPATIBILITY_SLICE].ds_openmask; + } + error = setdisklabel(lp, (struct disklabel *)data, + (u_long)openmask); + /* XXX why doesn't setdisklabel() check this? */ + if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0) + error = EXDEV; + if (error == 0) { + if (lp->d_secperunit > sp->ds_size) + error = ENOSPC; + for (part = 0; part < lp->d_npartitions; part++) + if (lp->d_partitions[part].p_size > sp->ds_size) + error = ENOSPC; + } +#if 0 /* XXX */ + if (error != 0 && setgeom != NULL) + error = setgeom(lp); +#endif + if (error != 0) { + free(lp, M_DEVBUF); + return (error); + } + free_ds_label(ssp, slice); + set_ds_label(ssp, slice, lp); +#ifdef DEVFS + set_ds_labeldevs(dname, dev, ssp); +#endif + return (0); + + case DIOCSYNCSLICEINFO: + if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART) + return (EINVAL); + if (!*(int *)data) + for (slice = 0; slice < ssp->dss_nslices; slice++) { + openmask = ssp->dss_slices[slice].ds_openmask; + if (openmask + && (slice != WHOLE_DISK_SLICE + || openmask & ~(1 << RAW_PART))) + return (EBUSY); + } + + /* + * Temporarily forget the current slices struct and read + * the current one. + * XXX should wait for current accesses on this disk to + * complete, then lock out future accesses and opens. + */ + *sspp = NULL; + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + error = dsopen(dname, dev, + ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask + & (1 << RAW_PART) ? S_IFCHR : S_IFBLK, + ssp->dss_oflags, sspp, lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (error); + } + + /* + * Reopen everything. 
This is a no-op except in the "force" + * case and when the raw bdev and cdev are both open. Abort + * if anything fails. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + for (openmask = ssp->dss_slices[slice].ds_bopenmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dname, + dkmodslice(dkmodpart(dev, part), + slice), + S_IFBLK, ssp->dss_oflags, sspp, + lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + /* XXX should free devfs toks. */ + free(lp, M_DEVBUF); + /* XXX should restore devfs toks. */ + *sspp = ssp; + return (EBUSY); + } + } + for (openmask = ssp->dss_slices[slice].ds_copenmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dname, + dkmodslice(dkmodpart(dev, part), + slice), + S_IFCHR, ssp->dss_oflags, sspp, + lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + /* XXX should free devfs toks. */ + free(lp, M_DEVBUF); + /* XXX should restore devfs toks. */ + *sspp = ssp; + return (EBUSY); + } + } + } + + /* XXX devfs tokens? */ + free(lp, M_DEVBUF); + dsgone(&ssp); + return (0); + + case DIOCWDINFO: + error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp, + strat, setgeom); + if (error != 0) + return (error); + /* + * XXX this used to hack on dk_openpart to fake opening + * partition 0 in case that is used instead of dkpart(dev). + */ + old_wlabel = sp->ds_wlabel; + set_ds_wlabel(ssp, slice, TRUE); + error = writedisklabel(dev, strat, sp->ds_label); + /* XXX should invalidate in-core label if write failed. */ + set_ds_wlabel(ssp, slice, old_wlabel); + return (error); + + case DIOCWLABEL: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + set_ds_wlabel(ssp, slice, *(int *)data != 0); + return (0); + + default: + return (ENOIOCTL); + } +} + +static void +dsiodone(bp) + struct buf *bp; +{ + struct iodone_chain *ic; + char *msg; + + ic = bp->b_iodone_chain; + bp->b_flags = (ic->ic_prev_flags & B_CALL) + | (bp->b_flags & ~(B_CALL | B_DONE)); + bp->b_iodone = ic->ic_prev_iodone; + bp->b_iodone_chain = ic->ic_prev_iodone_chain; + if (!(bp->b_flags & B_READ) + || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) { + msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr, + (struct disklabel *) + (bp->b_data + ic->ic_args[0].ia_long), + FALSE); + if (msg != NULL) + printf("%s\n", msg); + } + free(ic, M_DEVBUF); + biodone(bp); +} + +int +dsisopen(ssp) + struct diskslices *ssp; +{ + int slice; + + if (ssp == NULL) + return (0); + for (slice = 0; slice < ssp->dss_nslices; slice++) + if (ssp->dss_slices[slice].ds_openmask) + return (1); + return (0); +} + +/* + * Allocate a slices "struct" and initialize it to contain only an empty + * compatibility slice (pointing to itself), a whole disk slice (covering + * the disk as described by the label), and (nslices - BASE_SLICES) empty + * slices beginning at BASE_SLICE. 
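+ * Typical callers are dsinit(), which asks for MAX_SLICES entries before
+ * probing the DOS partition table, and dsopen(), which starts with a
+ * minimal BASE_SLICE-entry struct that dsinit() may later replace.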
+ */ +struct diskslices * +dsmakeslicestruct(nslices, lp) + int nslices; + struct disklabel *lp; +{ + struct diskslice *sp; + struct diskslices *ssp; + + ssp = malloc(offsetof(struct diskslices, dss_slices) + + nslices * sizeof *sp, M_DEVBUF, M_WAITOK); + ssp->dss_cdevsw = NULL; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = nslices; + ssp->dss_oflags = 0; + ssp->dss_secmult = lp->d_secsize / DEV_BSIZE; + if (ssp->dss_secmult & (ssp->dss_secmult - 1)) + ssp->dss_secshift = -1; + else + ssp->dss_secshift = ffs(ssp->dss_secmult) - 1; + ssp->dss_secsize = lp->d_secsize; + sp = &ssp->dss_slices[0]; + bzero(sp, nslices * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + return (ssp); +} + +char * +dsname(dname, unit, slice, part, partname) + char *dname; + int unit; + int slice; + int part; + char *partname; +{ + static char name[32]; + + if (strlen(dname) > 16) + dname = "nametoolong"; + snprintf(name, sizeof(name), "%s%d", dname, unit); + partname[0] = '\0'; + if (slice != WHOLE_DISK_SLICE || part != RAW_PART) { + partname[0] = 'a' + part; + partname[1] = '\0'; + if (slice != COMPATIBILITY_SLICE) + snprintf(name + strlen(name), + sizeof(name) - strlen(name), "s%d", slice - 1); + } + return (name); +} + +/* + * This should only be called when the unit is inactive and the strategy + * routine should not allow it to become active unless we call it. Our + * strategy routine must be special to allow activity. + */ +int +dsopen(dname, dev, mode, flags, sspp, lp, strat, setgeom, cdevsw) + char *dname; + dev_t dev; + int mode; + u_int flags; + struct diskslices **sspp; + struct disklabel *lp; + d_strategy_t *strat; + ds_setgeom_t *setgeom; + struct cdevsw *cdevsw; +{ + struct dkbad *btp; + dev_t dev1; + int error; + struct disklabel *lp1; + char *msg; + u_char mask; +#ifdef DEVFS + int mynor; +#endif + bool_t need_init; + int part; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + int unit; + + if (lp->d_secsize % DEV_BSIZE) + return (EINVAL); + + /* + * XXX reinitialize the slice table unless there is an open device + * on the unit. This should only be done if the media has changed. + */ + ssp = *sspp; + need_init = !dsisopen(ssp); + if (ssp != NULL && need_init) + dsgone(sspp); + if (need_init) { + /* + * Allocate a minimal slices "struct". This will become + * the final slices "struct" if we don't want real slices + * or if we can't find any real slices. + */ + *sspp = dsmakeslicestruct(BASE_SLICE, lp); + + if (!(flags & DSO_ONESLICE)) { + TRACE(("dsinit\n")); + error = dsinit(dname, dev, strat, lp, sspp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + ssp = *sspp; + ssp->dss_oflags = flags; +#ifdef DEVFS + ssp->dss_cdevsw = cdevsw; +#endif + + /* + * If there are no real slices, then make the compatiblity + * slice cover the whole disk. + */ + if (ssp->dss_nslices == BASE_SLICE) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = lp->d_secperunit; + + /* Point the compatibility slice at the BSD slice, if any. 
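+ * (the first slice of type DOSPTYP_386BSD found; its offset, size and type
+ * are copied so that the traditional unsliced device names refer to it)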
*/ + for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) { + ssp->dss_first_bsd_slice = slice; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset + = sp->ds_offset; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = sp->ds_size; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_type + = sp->ds_type; + break; + } + } + + ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp); + ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE; + if (setgeom != NULL) { + error = setgeom(lp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + } + + unit = dkunit(dev); + + /* + * Initialize secondary info for all slices. It is needed for more + * than the current slice in the DEVFS case. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_label != NULL) + continue; + dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice); + sname = dsname(dname, unit, slice, RAW_PART, partname); +#ifdef DEVFS + if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL + && sp->ds_size != 0) { + mynor = minor(dev1); + sp->ds_bdev = + devfs_add_devswf(bdevsw, mynor, DV_BLK, + UID_ROOT, GID_OPERATOR, 0640, + "%s", sname); + sp->ds_cdev = + devfs_add_devswf(cdevsw, mynor, DV_CHR, + UID_ROOT, GID_OPERATOR, 0640, + "r%s", sname); + } +#endif + /* + * XXX this should probably only be done for the need_init + * case, but there may be a problem with DIOCSYNCSLICEINFO. + */ + set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */ + lp1 = clone_label(lp); + TRACE(("readdisklabel\n")); + if (flags & DSO_NOLABELS) + msg = NULL; + else + msg = readdisklabel(dev1, strat, lp1); +#if 0 /* XXX */ + if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0) + msg = "setgeom failed"; +#endif + if (msg == NULL) + msg = fixlabel(sname, sp, lp1, FALSE); + if (msg == NULL && lp1->d_secsize != ssp->dss_secsize) + msg = "inconsistent sector size"; + if (msg != NULL) { + free(lp1, M_DEVBUF); + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) + log(LOG_WARNING, "%s: cannot find label (%s)\n", + sname, msg); + continue; + } + if (lp1->d_flags & D_BADSECT) { + btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK); + TRACE(("readbad144\n")); + msg = readbad144(dev1, strat, lp1, btp); + if (msg != NULL) { + log(LOG_WARNING, + "%s: cannot find bad sector table (%s)\n", + sname, msg); + free(btp, M_DEVBUF); + free(lp1, M_DEVBUF); + continue; + } + set_ds_bad(ssp, slice, internbad144(btp, lp1)); + free(btp, M_DEVBUF); + if (sp->ds_bad == NULL) { + free(lp1, M_DEVBUF); + continue; + } + } + set_ds_label(ssp, slice, lp1); +#ifdef DEVFS + set_ds_labeldevs(dname, dev1, ssp); +#endif + set_ds_wlabel(ssp, slice, FALSE); + } + + slice = dkslice(dev); + if (slice >= ssp->dss_nslices) + return (ENXIO); + sp = &ssp->dss_slices[slice]; + part = dkpart(dev); + if (part != RAW_PART + && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions)) + return (EINVAL); /* XXX needs translation */ + mask = 1 << part; + switch (mode) { + case S_IFBLK: + sp->ds_bopenmask |= mask; + break; + case S_IFCHR: + sp->ds_copenmask |= mask; + break; + } + sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask; + return (0); +} + +int +dssize(dev, sspp, dopen, dclose) + dev_t dev; + struct diskslices **sspp; + d_open_t dopen; + d_close_t dclose; +{ + struct disklabel *lp; + int part; + int slice; + struct diskslices *ssp; + + slice = dkslice(dev); + part = dkpart(dev); + ssp = *sspp; + if (ssp == NULL || slice >= ssp->dss_nslices + || 
!(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) { + if (dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0) + return (-1); + dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL); + ssp = *sspp; + } + lp = ssp->dss_slices[slice].ds_label; + if (lp == NULL) + return (-1); + return ((int)lp->d_partitions[part].p_size); +} + +static void +free_ds_label(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; +#ifdef DEVFS + free_ds_labeldevs(ssp, slice); + if (slice == COMPATIBILITY_SLICE) + free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice); + else if (slice == ssp->dss_first_bsd_slice) + free_ds_labeldevs(ssp, COMPATIBILITY_SLICE); +#endif + free(lp, M_DEVBUF); + set_ds_label(ssp, slice, (struct disklabel *)NULL); +} + +#ifdef DEVFS +static void +free_ds_labeldevs(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + int part; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; + for (part = 0; part < lp->d_npartitions; part++) { + if (sp->ds_bdevs[part] != NULL) { + devfs_remove_dev(sp->ds_bdevs[part]); + sp->ds_bdevs[part] = NULL; + } + if (sp->ds_cdevs[part] != NULL) { + devfs_remove_dev(sp->ds_cdevs[part]); + sp->ds_cdevs[part] = NULL; + } + } +} +#endif + +static char * +fixlabel(sname, sp, lp, writeflag) + char *sname; + struct diskslice *sp; + struct disklabel *lp; + int writeflag; +{ + u_long end; + u_long offset; + int part; + struct partition *pp; + u_long start; + bool_t warned; + + /* These errors "can't happen" so don't bother reporting details. */ + if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC) + return ("fixlabel: invalid magic"); + if (dkcksum(lp) != 0) + return ("fixlabel: invalid checksum"); + + pp = &lp->d_partitions[RAW_PART]; + if (writeflag) { + start = 0; + offset = sp->ds_offset; + } else { + start = sp->ds_offset; + offset = -sp->ds_offset; + } + if (pp->p_offset != start) { + if (sname != NULL) { + printf( +"%s: rejecting BSD label: raw partition offset != slice offset\n", + sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + return ("fixlabel: raw partition offset != slice offset"); + } + if (pp->p_size != sp->ds_size) { + if (sname != NULL) { + printf("%s: raw partition size != slice size\n", sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + if (pp->p_size > sp->ds_size) { + if (sname == NULL) + return ("fixlabel: raw partition size > slice size"); + printf("%s: truncating raw partition\n", sname); + pp->p_size = sp->ds_size; + } + } + end = start + sp->ds_size; + if (start > end) + return ("fixlabel: slice wraps"); + if (lp->d_secpercyl <= 0) + return ("fixlabel: d_secpercyl <= 0"); + pp -= RAW_PART; + warned = FALSE; + for (part = 0; part < lp->d_npartitions; part++, pp++) { + if (pp->p_offset != 0 || pp->p_size != 0) { + if (pp->p_offset < start + || pp->p_offset + pp->p_size > end + || pp->p_offset + pp->p_size < pp->p_offset) { + if (sname != NULL) { + printf( +"%s: rejecting partition in BSD label: it isn't entirely within the slice\n", + sname); + if (!warned) { + slice_info(sname, sp); + warned = TRUE; + } + partition_info(sname, part, pp); + } + /* XXX else silently discard junk. 
*/ + bzero(pp, sizeof *pp); + } else + pp->p_offset += offset; + } + } + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (NULL); +} + +static void +partition_info(sname, part, pp) + char *sname; + int part; + struct partition *pp; +{ + printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part, + (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1), + (u_long)pp->p_size); +} + +static void +slice_info(sname, sp) + char *sname; + struct diskslice *sp; +{ + printf("%s: start %lu, end %lu, size %lu\n", sname, + sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size); +} + +/* + * Most changes to ds_bad, ds_label and ds_wlabel are made using the + * following functions to ensure coherency of the compatibility slice + * with the first BSD slice. The openmask fields are _not_ shared and + * the other fields (ds_offset and ds_size) aren't changed after they + * are initialized. + */ +static void +set_ds_bad(ssp, slice, btp) + struct diskslices *ssp; + int slice; + struct dkbad_intern *btp; +{ + ssp->dss_slices[slice].ds_bad = btp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp; +} + +static void +set_ds_label(ssp, slice, lp) + struct diskslices *ssp; + int slice; + struct disklabel *lp; +{ + ssp->dss_slices[slice].ds_label = lp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp; +} + +#ifdef DEVFS +static void +set_ds_labeldevs(dname, dev, ssp) + char *dname; + dev_t dev; + struct diskslices *ssp; +{ + int slice; + + set_ds_labeldevs_unaliased(dname, dev, ssp); + if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE) + return; + slice = dkslice(dev); + if (slice == COMPATIBILITY_SLICE) + set_ds_labeldevs_unaliased(dname, + dkmodslice(dev, ssp->dss_first_bsd_slice), ssp); + else if (slice == ssp->dss_first_bsd_slice) + set_ds_labeldevs_unaliased(dname, + dkmodslice(dev, COMPATIBILITY_SLICE), ssp); +} + +static void +set_ds_labeldevs_unaliased(dname, dev, ssp) + char *dname; + dev_t dev; + struct diskslices *ssp; +{ + struct disklabel *lp; + int mynor; + int part; + char partname[2]; + struct partition *pp; + int slice; + char *sname; + struct diskslice *sp; + + slice = dkslice(dev); + sp = &ssp->dss_slices[slice]; + if (sp->ds_size == 0) + return; + lp = sp->ds_label; + for (part = 0; part < lp->d_npartitions; part++) { + pp = &lp->d_partitions[part]; + if (pp->p_size == 0) + continue; + sname = dsname(dname, dkunit(dev), slice, part, partname); + if (part == RAW_PART && sp->ds_bdev != NULL) { + sp->ds_bdevs[part] = + devfs_makelink(sp->ds_bdev, + "%s%s", sname, partname); + sp->ds_cdevs[part] = + devfs_makelink(sp->ds_cdev, + "r%s%s", sname, partname); + } else { + mynor = minor(dkmodpart(dev, part)); + sp->ds_bdevs[part] = + devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_BLK, + UID_ROOT, GID_OPERATOR, 0640, + "%s%s", sname, partname); + sp->ds_cdevs[part] = + devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR, + UID_ROOT, GID_OPERATOR, 0640, + "r%s%s", sname, partname); + } + } +} +#endif /* DEVFS */ + +static void +set_ds_wlabel(ssp, slice, wlabel) + struct diskslices *ssp; + int slice; + int wlabel; +{ + ssp->dss_slices[slice].ds_wlabel = wlabel; + if (slice == COMPATIBILITY_SLICE) + 
ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel; +} diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c new file mode 100644 index 0000000..4686a17 --- /dev/null +++ b/sys/kern/subr_dkbad.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_dkbad.c,v 1.7 1997/11/24 04:14:21 dyson Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/dkbad.h> +#include <sys/malloc.h> + +/* + * Internalize the bad sector table. + * TODO: + * o Fix types. + * Type long should be daddr_t since we compare with blkno's. + * Sentinel -1 should be ((daddr_t)-1). + * o Can remove explicit test for sentinel if it is a positive + * (unsigned or not) value larger than all possible blkno's. + * o Check that the table is sorted. + * o Use faster searches. + * o Use the internal table in wddump(). + * o Don't duplicate so much code. + * o Do all bad block handing in a driver-independent file. + * o Remove limit of 126 spare sectors. 
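+ * Each entry's cylinder and track/sector fields are converted to an
+ * absolute sector number (cyl * secpercyl + track * nsectors + sector);
+ * transbad144() later remaps the i'th such sector to spare sector
+ * bi_maxspare - i.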
+ */ +struct dkbad_intern * +internbad144(btp, lp) + struct dkbad *btp; + struct disklabel *lp; +{ + struct dkbad_intern *bip; + int i; + + bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK); + /* + * Spare sectors are allocated beginning with the last sector of + * the second last track of the disk (the last track is used for + * the bad sector list). + */ + bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1; + bip->bi_nbad = DKBAD_MAXBAD; + i = 0; + for (; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++) + bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl + + (btp->bt_bad[i].bt_trksec >> 8) + * lp->d_nsectors + + (btp->bt_bad[i].bt_trksec & 0x00ff); + bip->bi_bad[i] = -1; + return (bip); +} + +char * +readbad144(dev, strat, lp, bdp) + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct dkbad *bdp; +{ + struct buf *bp; + struct dkbad *db; + int i; + char *msg; + + bp = geteblk((int)lp->d_secsize); + i = 0; + do { + /* Read a bad sector table. */ + bp->b_dev = dev; + bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i; + if (lp->d_secsize > DEV_BSIZE) + bp->b_blkno *= lp->d_secsize / DEV_BSIZE; + else + bp->b_blkno /= DEV_BSIZE / lp->d_secsize; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + bp->b_flags &= ~B_ERROR; + (*strat)(bp); + + /* If successful, validate, otherwise try another. */ + if (biowait(bp) == 0) { + db = (struct dkbad *)(bp->b_data); + if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) { + msg = NULL; + *bdp = *db; + break; + } + msg = "bad sector table corrupted"; + } else + msg = "bad sector table I/O error"; + } while ((bp->b_flags & B_ERROR) && (i += 2) < 10 && + i < lp->d_nsectors); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +daddr_t +transbad144(bip, blkno) + struct dkbad_intern *bip; + daddr_t blkno; +{ + int i; + + /* + * List is sorted, so the search can terminate when it is past our + * sector. + */ + for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++) + if (bip->bi_bad[i] == blkno) + /* + * Spare sectors are allocated in decreasing order. + */ + return (bip->bi_maxspare - i); + return (blkno); +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 0000000..1204376 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_log.c,v 1.32 1998/11/11 10:55:56 truckman Exp $ + */ + +/* + * Error log buffer for kernel printf's. + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/msgbuf.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/filedesc.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_poll_t logpoll; + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = + { logopen, logclose, logread, nowrite, /*7*/ + logioctl, nostop, nullreset, nodevtotty,/* klog */ + logpoll, nommap, NULL, "log", NULL, -1 }; + +static struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + struct sigio *sc_sigio; /* information for async I/O */ +} logsoftc; + +int log_open; /* also used in log() */ + +/*ARGSUSED*/ +static int +logopen(dev, flags, mode, p) + dev_t dev; + int flags, mode; + struct proc *p; +{ + if (log_open) + return (EBUSY); + log_open = 1; + fsetown(p->p_pid, &logsoftc.sc_sigio); /* signal process only */ + return (0); +} + +/*ARGSUSED*/ +static int +logclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + + log_open = 0; + logsoftc.sc_state = 0; + funsetown(logsoftc.sc_sigio); + return (0); +} + +/*ARGSUSED*/ +static int +logread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct msgbuf *mbp = msgbufp; + register long l; + register int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = mbp->msg_size - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr, + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +static int +logpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + int s; + int revents = 0; + + s = splhigh(); + + if (events & (POLLIN | POLLRDNORM)) + if 
(msgbufp->msg_bufr != msgbufp->msg_bufx) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &logsoftc.sc_selp); + + splx(s); + return (revents); +} + +void +logwakeup() +{ + if (!log_open) + return; + selwakeup(&logsoftc.sc_selp); + if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) + pgsigio(logsoftc.sc_sigio, SIGIO, 0); + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } +} + +/*ARGSUSED*/ +static int +logioctl(dev, com, data, flag, p) + dev_t dev; + u_long com; + caddr_t data; + int flag; + struct proc *p; +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += msgbufp->msg_size; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case FIOSETOWN: + return (fsetown(*(int *)data, &logsoftc.sc_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(logsoftc.sc_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead */ + case TIOCGPGRP: + *(int *)data = -fgetown(logsoftc.sc_sigio); + break; + + default: + return (ENOTTY); + } + return (0); +} + +static int log_devsw_installed; +#ifdef DEVFS +static void *log_devfs_token; +#endif + +static void log_drvinit __P((void *unused)); +static void +log_drvinit(unused) + void *unused; +{ + dev_t dev; + + if( ! log_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&log_cdevsw,NULL); + log_devsw_installed = 1; +#ifdef DEVFS + log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "klog"); +#endif + } +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c new file mode 100644 index 0000000..7eb635a --- /dev/null +++ b/sys/kern/subr_module.c @@ -0,0 +1,267 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_module.c,v 1.3 1998/10/12 09:03:48 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/linker.h> + +/* + * Preloaded module support + */ + +caddr_t preload_metadata; + +/* + * Search for the preloaded module (name) + */ +caddr_t +preload_search_by_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if ((hdr[0] == MODINFO_NAME) && + !strcmp(name, curp + sizeof(u_int32_t) * 2)) + return(curp); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Search for the first preloaded module of (type) + */ +caddr_t +preload_search_by_type(const char *type) +{ + caddr_t curp, lname; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(u_int32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Walk through the preloaded module list + */ +caddr_t +preload_search_next_name(caddr_t base) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + /* Pick up where we left off last time */ + if (base) { + /* skip to next field */ + curp = base; + hdr = (u_int32_t *)curp; + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } else + curp = preload_metadata; + + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Found a new record? */ + if (hdr[0] == MODINFO_NAME) + return curp; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Given a preloaded module handle (mod), return a pointer + * to the data for the attribute (inf). + */ +caddr_t +preload_search_info(caddr_t mod, int inf) +{ + caddr_t curp; + u_int32_t *hdr; + u_int32_t type = 0; + int next; + + curp = mod; + for (;;) { + hdr = (u_int32_t *)curp; + /* end of module data? */ + if (hdr[0] == 0 && hdr[1] == 0) + break; + /* + * We give up once we've looped back to what we were looking at + * first - this should normally be a MODINFO_NAME field. + */ + if (type == 0) { + type = hdr[0]; + } else { + if (hdr[0] == type) + break; + } + + /* + * Attribute match? Return pointer to data. + * Consumer may safely assume that size value preceeds + * data. 
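A consumer that has located a preloaded file can then fetch individual attributes from its metadata. A minimal sketch, assuming something was preloaded under the hypothetical name "kernel" and asking for its load address:

	caddr_t mod, attr;
	vm_offset_t addr;

	mod = preload_search_by_name("kernel");
	if (mod != NULL) {
		attr = preload_search_info(mod, MODINFO_ADDR);
		if (attr != NULL)
			addr = *(vm_offset_t *)attr;
	}

The size word stored just ahead of the returned pointer gives the length of the attribute data.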
+ */ + if (hdr[0] == inf) + return(curp + (sizeof(u_int32_t) * 2)); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + return(NULL); +} + +/* + * Delete a preload record by name. + */ +void +preload_delete_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + int clearing; + + if (preload_metadata != NULL) { + + clearing = 0; + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if (hdr[0] == MODINFO_NAME) { + if (!strcmp(name, curp + sizeof(u_int32_t) * 2)) + clearing = 1; /* got it, start clearing */ + else if (clearing) + clearing = 0; /* at next one now.. better stop */ + } + if (clearing) + hdr[0] = MODINFO_EMPTY; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} + +/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */ +void +preload_bootstrap_relocate(vm_offset_t offset) +{ + caddr_t curp; + u_int32_t *hdr; + vm_offset_t *ptr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Deal with the ones that we know we have to fix */ + switch (hdr[0]) { + case MODINFO_ADDR: + case MODINFO_METADATA|MODINFOMD_SSYM: + case MODINFO_METADATA|MODINFOMD_ESYM: + ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2)); + *ptr += offset; + break; + } + /* The rest is beyond us for now */ + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..ef98c59 --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $Id: param.c,v 1.31 1998/11/05 14:28:17 dg Exp $ + */ + +#include <stddef.h> + +#include "opt_sysvipc.h" +#include "opt_param.h" + +#include <sys/param.h> + +#ifdef SYSVSHM +#include <machine/vmparam.h> +#include <sys/shm.h> +#endif +#ifdef SYSVSEM +#include <sys/sem.h> +#endif +#ifdef SYSVMSG +#include <sys/msg.h> +#endif + +/* + * System parameter formulae. + * + * This file is copied into each directory where we compile + * the kernel; it should be modified there to suit local taste + * if necessary. + * + * Compiled with -DMAXUSERS=xx + */ + +#ifndef HZ +#define HZ 100 +#endif +int hz = HZ; +int tick = 1000000 / HZ; +int tickadj = howmany(30000, 60 * HZ); /* can adjust 30ms in 60s */ +#define NPROC (20 + 16 * MAXUSERS) +#define MAXFILES (NPROC*2) +int maxproc = NPROC; /* maximum # of processes */ +int maxprocperuid = NPROC-1; /* maximum # of processes per user */ +int maxfiles = MAXFILES; /* system wide open files limit */ +int maxfilesperproc = MAXFILES; /* per-process open files limit */ +int ncallout = 16 + NPROC + MAXFILES; /* maximum # of timer events */ + +/* maximum # of mbuf clusters */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (512 + MAXUSERS * 16) +#endif +int nmbclusters = NMBCLUSTERS; + +#if MAXFILES > NMBCLUSTERS +#define MAXSOCKETS MAXFILES +#else +#define MAXSOCKETS NMBCLUSTERS +#endif +int maxsockets = MAXSOCKETS; + +/* allocate 1/4th amount of virtual address space for mbufs XXX */ +int nmbufs = NMBCLUSTERS * 4; + +/* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers) */ +#ifndef NSFBUFS +#define NSFBUFS (512 + MAXUSERS * 16) +#endif +int nsfbufs = NSFBUFS; + +/* + * Values in support of System V compatible shared memory. XXX + */ +#ifdef SYSVSHM +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 32 /* <= SHMMMNI in shm.h */ +#endif +#ifndef SHMSEG +#define SHMSEG 8 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; +#endif + +/* + * Values in support of System V compatible semaphores. + */ + +#ifdef SYSVSEM + +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; +#endif + +/* + * Values in support of System V compatible messages. 
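The MAXUSERS-derived formulae earlier in this file scale several kernel tables together. As a worked example (an arbitrary value, not a recommendation), a kernel built with -DMAXUSERS=32 gets:

	NPROC       = 20 + 16 * 32      = 532	(maxproc)
	MAXFILES    = 532 * 2           = 1064	(maxfiles, maxfilesperproc)
	ncallout    = 16 + 532 + 1064   = 1612
	NMBCLUSTERS = 512 + 32 * 16     = 1024	(unless overridden)
	NSFBUFS     = 512 + 32 * 16     = 1024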
+ */ + +#ifdef SYSVMSG + +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; +#endif + +/* + * These may be set to nonzero here or by patching. + * If they are nonzero at bootstrap time then they are + * initialized to values dependent on the memory size. + */ +#ifdef NBUF +int nbuf = NBUF; +#else +int nbuf = 0; +#endif +int nswbuf = 0; + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c new file mode 100644 index 0000000..424ac9f --- /dev/null +++ b/sys/kern/subr_prf.c @@ -0,0 +1,716 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $Id: subr_prf.c,v 1.50 1998/09/06 06:25:04 ache Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/tprintf.h> +#include <sys/syslog.h> +#include <machine/cons.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +struct tty *constty; /* pointer to console "window" tty */ + +struct putchar_arg { + int flags; + struct tty *tty; +}; + +struct snprintf_arg { + char *str; + size_t remain; +}; + +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void logpri __P((int level)); +static void msglogchar(int c, void *dummyarg); +static void putchar __P((int ch, void *arg)); +static char *ksprintn __P((u_long num, int base, int *len)); +static void snprintf_func __P((int ch, void *arg)); + +static int consintr = 1; /* Ok to handle console interrupts? */ +static int msgbufmapped; /* Set when safe to use msgbuf */ + +/* + * Warn that a system table is full. + */ +void +tablefull(tab) + const char *tab; +{ + + log(LOG_ERR, "%s: table is full\n", tab); +} + +/* + * Uprintf prints to the controlling terminal for the current process. + * It may block if the tty queue is overfull. No message is printed if + * the queue does not clear in a reasonable time. + */ +void +uprintf(const char *fmt, ...) +{ + struct proc *p = curproc; + va_list ap; + struct putchar_arg pca; + + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + va_start(ap, fmt); + pca.tty = p->p_session->s_ttyp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } +} + +tpr_t +tprintf_open(p) + register struct proc *p; +{ + + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + SESSHOLD(p->p_session); + return ((tpr_t) p->p_session); + } + return ((tpr_t) NULL); +} + +void +tprintf_close(sess) + tpr_t sess; +{ + + if (sess) + SESSRELE((struct session *) sess); +} + +/* + * tprintf prints on the controlling terminal associated + * with the given session. + */ +void +tprintf(tpr_t tpr, const char *fmt, ...) +{ + register struct session *sess = (struct session *)tpr; + struct tty *tp = NULL; + int flags = TOLOG; + va_list ap; + struct putchar_arg pca; + + logpri(LOG_INFO); + if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) { + flags |= TOTTY; + tp = sess->s_ttyp; + } + va_start(ap, fmt); + pca.tty = tp; + pca.flags = flags; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + logwakeup(); +} + +/* + * Ttyprintf displays a message on a tty; it should be used only by + * the tty driver, or anything that knows the underlying tty will not + * be revoke(2)'d away. Other callers should use tprintf. + */ +void +ttyprintf(struct tty *tp, const char *fmt, ...) +{ + va_list ap; + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = tp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); +} + +extern int log_open; + +/* + * Log writes to the log buffer, and guarantees not to sleep (so can be + * called by interrupt routines). If there is no process reading the + * log yet, it writes to the console also. + */ +void +log(int level, const char *fmt, ...) 
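A minimal sketch of a typical caller, with a made-up device name and message:

	log(LOG_ERR, "foo0: hypothetical command timeout, resetting\n");

The text is appended to the kernel message buffer through msglogchar(); when no process has the log device open, it is also echoed to the console.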
+{ + register int s; + va_list ap; + + s = splhigh(); + logpri(level); + va_start(ap, fmt); + + kvprintf(fmt, msglogchar, NULL, 10, ap); + va_end(ap); + + splx(s); + if (!log_open) { + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } + logwakeup(); +} + +static void +logpri(level) + int level; +{ + register char *p; + + msglogchar('<', NULL); + for (p = ksprintn((u_long)level, 10, NULL); *p;) + msglogchar(*p--, NULL); + msglogchar('>', NULL); +} + +int +addlog(const char *fmt, ...) +{ + register int s; + va_list ap; + int retval; + + s = splhigh(); + va_start(ap, fmt); + retval = kvprintf(fmt, msglogchar, NULL, 10, ap); + splx(s); + va_end(ap); + if (!log_open) { + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } + logwakeup(); + return (retval); +} + +int +printf(const char *fmt, ...) +{ + va_list ap; + register int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ + return retval; +} + +void +vprintf(const char *fmt, va_list ap) +{ + register int savintr; + struct putchar_arg pca; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ +} + +/* + * Print a character on console or users terminal. If destination is + * the console then the last bunch of characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, NULL); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return retval; +} + +/* + * Scaled down version of vsprintf(3). + */ +int +vsprintf(char *buf, const char *cfmt, va_list ap) +{ + int retval; + + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + return retval; +} + +/* + * Scaled down version of snprintf(3). + */ +int +snprintf(char *str, size_t size, const char *format, ...) +{ + int retval; + va_list ap; + + va_start(ap, format); + retval = vsnprintf(str, size, format, ap); + va_end(ap); + return(retval); +} + +/* + * Scaled down version of vsnprintf(3). 
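The bounded variants truncate rather than overrun the destination, and with a size of at least one they always NUL-terminate. A sketch with a made-up buffer and unit number:

	char label[8];

	snprintf(label, sizeof(label), "sd%d", 2);	/* label = "sd2" */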
+ */ +int +vsnprintf(char *str, size_t size, const char *format, va_list ap) +{ + struct snprintf_arg info; + int retval; + + info.str = str; + info.remain = size; + retval = kvprintf(format, snprintf_func, &info, 10, ap); + if (info.remain >= 1) + *info.str++ = '\0'; + return retval; +} + +static void +snprintf_func(int ch, void *arg) +{ + struct snprintf_arg *const info = arg; + + if (info->remain >= 2) { + *info->str++ = ch; + info->remain--; + } +} + +/* + * Put a number (base <= 16) in a buffer in reverse order; return an + * optional length and a pointer to the NULL terminated (preceded?) + * buffer. + */ +static char * +ksprintn(ul, base, lenp) + register u_long ul; + register int base, *lenp; +{ /* A long in base 8, plus NULL. */ + static char buf[sizeof(long) * NBBY / 3 + 2]; + register char *p; + + p = buf; + do { + *++p = hex2ascii(ul % base); + } while (ul /= base); + if (lenp) + *lenp = p - buf; + return (p); +} + +/* + * Scaled down version of printf(3). + * + * Two additional formats: + * + * The format %b is supported to decode error registers. + * Its usage is: + * + * printf("reg=%b\n", regval, "<base><arg>*"); + * + * where <base> is the output base expressed as a control character, e.g. + * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, + * the first of which gives the bit number to be inspected (origin 1), and + * the next characters (up to a control character, i.e. a character <= 32), + * give the name of the register. Thus: + * + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * + * would produce output: + * + * reg=3<BITTWO,BITONE> + * + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... + */ +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char *p, *q, *d; + u_char *up; + int ch, n; + u_long ul; + int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int dwidth; + char padc; + int retval = 0; + + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return retval; + PCHAR(ch); + } + lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + ul = va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(ul, *p++, NULL); *q;) + PCHAR(*q--); + + if (!ul) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (ul & (1 << (n - 1))) { + PCHAR(tmp ? 
',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + ul = lflag ? va_arg(ap, long) : va_arg(ap, int); + sign = 1; + base = 10; + goto number; + case 'l': + lflag = 1; + goto reswitch; + case 'o': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 8; + goto nosign; + case 'p': + ul = (uintptr_t)va_arg(ap, void *); + base = 16; + sharpflag = (width == 0); + goto nosign; + case 'n': + case 'r': + ul = lflag ? va_arg(ap, u_long) : + sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int); + base = radix; + goto number; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 'u': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 10; + goto nosign; + case 'x': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 16; + goto nosign; + case 'z': + ul = lflag ? va_arg(ap, u_long) : + sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int); + base = 16; + goto number; +nosign: sign = 0; +number: if (sign && (long)ul < 0L) { + neg = 1; + ul = -(long)ul; + } + p = ksprintn(ul, base, &tmp); + if (sharpflag && ul != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && ul != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + PCHAR('%'); + if (lflag) + PCHAR('l'); + PCHAR(ch); + break; + } + } +#undef PCHAR +} + +/* + * Put character in log buffer. + */ +static void +msglogchar(int c, void *dummyarg) +{ + struct msgbuf *mbp; + + if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { + mbp = msgbufp; + mbp->msg_ptr[mbp->msg_bufx++] = c; + if (mbp->msg_bufx >= mbp->msg_size) + mbp->msg_bufx = 0; + /* If the buffer is full, keep the most recent data. 
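The message buffer is a simple ring indexed by msg_bufr (reader) and msg_bufx (writer); a reader computes the number of pending bytes the same way logioctl()'s FIONREAD case in subr_log.c does:

	long avail = mbp->msg_bufx - mbp->msg_bufr;
	if (avail < 0)
		avail += mbp->msg_size;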
*/ + if (mbp->msg_bufr == mbp->msg_bufx) { + if (++mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + } +} + +void +msgbufinit(void *ptr, size_t size) +{ + char *cp; + + cp = (char *)ptr; + msgbufp = (struct msgbuf *) (cp + size - sizeof(*msgbufp)); + if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_ptr != cp) { + bzero(cp, size); + msgbufp->msg_magic = MSG_MAGIC; + msgbufp->msg_size = (char *)msgbufp - cp; + msgbufp->msg_ptr = cp; + } + msgbufmapped = 1; +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) +{ + int i, j; + + if (!msgbufmapped) { + db_printf("msgbuf not mapped yet\n"); + return; + } + db_printf("msgbufp = %p\n", msgbufp); + db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n", + msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr, + msgbufp->msg_bufx, msgbufp->msg_ptr); + for (i = 0; i < msgbufp->msg_size; i++) { + j = (i + msgbufp->msg_bufr) % msgbufp->msg_size; + db_printf("%c", msgbufp->msg_ptr[j]); + } + db_printf("\n"); +} + +#endif /* DDB */ diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c new file mode 100644 index 0000000..d0ecad7 --- /dev/null +++ b/sys/kern/subr_prof.c @@ -0,0 +1,457 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $Id: subr_prof.c,v 1.27 1998/07/14 05:09:46 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> + +#ifdef GPROF +#include <sys/malloc.h> +#include <sys/gmon.h> + +static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); + +static void kmstartup __P((void *)); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + +struct gmonparam _gmonparam = { GMON_PROF_OFF }; + +#ifdef GUPROF +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + +void +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +static void +kmstartup(dummy) + void *dummy; +{ + char *cp; + struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + uintfptr_t tmp_addr; +#endif + + /* + * Round lowpc and highpc to multiples of the density we're using + * so the rest of the scaling (here and in gprof) stays in ints. + */ + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->textsize = p->highpc - p->lowpc; + printf("Profiling kernel, textsize=%lu [%x..%x]\n", + p->textsize, p->lowpc, p->highpc); + p->kcountsize = p->textsize / HISTFRACTION; + p->hashfraction = HASHFRACTION; + p->fromssize = p->textsize / HASHFRACTION; + p->tolimit = p->textsize * ARCDENSITY / 100; + if (p->tolimit < MINARCS) + p->tolimit = MINARCS; + else if (p->tolimit > MAXARCS) + p->tolimit = MAXARCS; + p->tossize = p->tolimit * sizeof(struct tostruct); + cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, + M_GPROF, M_NOWAIT); + if (cp == 0) { + printf("No memory for profiling.\n"); + return; + } + bzero(cp, p->kcountsize + p->tossize + p->fromssize); + p->tos = (struct tostruct *)cp; + cp += p->tossize; + p->kcount = (HISTCOUNTER *)cp; + cp += p->kcountsize; + p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + disable_intr(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("call mexitcount; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + __asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + enable_intr(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; + tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. + * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. 
+ */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ +} + +/* + * Return kernel profiling information. + */ +static int +sysctl_kern_prof SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + struct gmonparam *gp = &_gmonparam; + int error; + int state; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case GPROF_STATE: + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); + if (error) + return (error); + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; + stopprofclock(&proc0); + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; + startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); + return (0); + case GPROF_COUNT: + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); + case GPROF_FROMS: + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); + case GPROF_TOS: + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); + case GPROF_GMONPARAM: + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); +#endif /* GPROF */ + +/* + * Profiling system call. + * + * The scale factor is a fixed point number with 16 bits of fraction, so that + * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. 
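As a worked example with illustrative numbers: with pr_off = 0x1000 and pr_scale = 0x8000 (that is, 0.5), a pc of 0x1806 maps to

	((0x1806 - 0x1000) * 0x8000) >> 16 = 0x403, masked with ~1 to 0x402

which is exactly what the PC_TO_INDEX() macro below computes; the result is the u_short-aligned byte offset into the caller's sample buffer.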
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + size_t size; + size_t offset; + u_int scale; +}; +#endif +/* ARGSUSED */ +int +profil(p, uap) + struct proc *p; + register struct profil_args *uap; +{ + register struct uprof *upp; + int s; + + if (uap->scale > (1 << 16)) + return (EINVAL); + if (uap->scale == 0) { + stopprofclock(p); + return (0); + } + upp = &p->p_stats->p_prof; + + /* Block profile interrupts while changing state. */ + s = splstatclock(); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; + startprofclock(p); + splx(s); + + return (0); +} + +/* + * Scale is a fixed-point number with the binary point 16 bits + * into the value, and is <= 1.0. pc is at most 32 bits, so the + * intermediate result is at most 48 bits. + */ +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +/* + * Collect user-level profiling statistics; called on a profiling tick, + * when a process is running in user-mode. This routine may be called + * from an interrupt context. We try to update the user profiling buffers + * cheaply with fuswintr() and suswintr(). If that fails, we revert to + * an AST that will vector us to trap() with a context in which copyin + * and copyout will work. Trap will then call addupc_task(). + * + * Note that we may (rarely) not get around to the AST soon enough, and + * lose profile ticks when the next tick overwrites this one, but in this + * case the system is overloaded and the profile is probably already + * inaccurate. + */ +void +addupc_intr(p, pc, ticks) + register struct proc *p; + register u_long pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + register int v; + + if (ticks == 0) + return; + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; /* out of range; ignore */ + + addr = prof->pr_base + i; + if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { + prof->pr_addr = pc; + prof->pr_ticks = ticks; + need_proftick(p); + } +} + +/* + * Much like before, but we can afford to take faults here. If the + * update fails, we simply turn off profiling. + */ +void +addupc_task(p, pc, ticks) + register struct proc *p; + register u_long pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + u_short v; + + /* Testing P_PROFIL may be unnecessary, but is certainly safe. */ + if ((p->p_flag & P_PROFIL) == 0 || ticks == 0) + return; + + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; + + addr = prof->pr_base + i; + if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) { + v += ticks; + if (copyout((caddr_t)&v, addr, sizeof(v)) == 0) + return; + } + stopprofclock(p); +} diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c index 3adf5a8..80a39cf 100644 --- a/sys/kern/subr_rlist.c +++ b/sys/kern/subr_rlist.c @@ -12,25 +12,25 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This software is a component of "386BSD" developed by - William F. Jolitz, TeleMuse. + * This software is a component of "386BSD" developed by + * William F. Jolitz, TeleMuse. * 4. 
Neither the name of the developer nor the name "386BSD" * may be used to endorse or promote products derived from this software * without specific prior written permission. * - * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ - * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS - * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. - * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT + * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ + * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS + * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. + * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT * NOT MAKE USE THIS WORK. * * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED - * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN - * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES - * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING - * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND - * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE - * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS + * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN + * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES + * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING + * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND + * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE + * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992. * * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND @@ -46,99 +46,185 @@ * SUCH DAMAGE. * */ -static char rcsid[] = "$Header: /usr/bill/working/sys/kern/RCS/subr_rlist.c,v 1.2 92/01/21 21:29:31 william Exp $"; +/* + * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may + * be used, modified, copied, distributed, and sold, in both source and + * binary form provided that the above copyright and these terms are + * retained. Under no circumstances is the author responsible for the proper + * functioning of this software, nor does the author assume any responsibility + * for damages incurred with its use. + * + * --------- DEPRECIATED --------- + * + * $Id: subr_rlist.c,v 1.30 1999/01/21 08:29:04 dillon Exp $ + */ -#include "sys/param.h" -#include "sys/cdefs.h" -#include "sys/malloc.h" -#include "rlist.h" +#if 0 + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/rlist.h> +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> /* * Resource lists. */ -/* - * Add space to a resource list. Used to either - * initialize a list or return free space to it. 
- */ -rlist_free (rlp, start, end) -register struct rlist **rlp; unsigned start, end; { - struct rlist *head; - - head = *rlp; - -loop: - /* if nothing here, insert (tail of list) */ - if (*rlp == 0) { - *rlp = (struct rlist *)malloc(sizeof(**rlp), M_TEMP, M_NOWAIT); - (*rlp)->rl_start = start; - (*rlp)->rl_end = end; - (*rlp)->rl_next = 0; - return; - } +#define RLIST_MIN 128 +static int rlist_count=0; +static struct rlist *rlfree; - /* if new region overlaps something currently present, panic */ - if (start >= (*rlp)->rl_start && start <= (*rlp)->rl_end) { - printf("Frag %d:%d, ent %d:%d ", start, end, - (*rlp)->rl_start, (*rlp)->rl_end); - panic("overlapping front rlist_free: freed twice?"); +static struct rlist *rlist_malloc __P((void)); +static __inline void rlist_mfree __P((struct rlist *rl)); + +static struct rlist * +rlist_malloc() +{ + struct rlist *rl; + int i; + while( rlist_count < RLIST_MIN) { + int s = splhigh(); + rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE); + splx(s); + if( !rl) + break; + + for(i=0;i<(PAGE_SIZE/(sizeof *rl));i++) { + rl->rl_next = rlfree; + rlfree = rl; + rlist_count++; + rl++; + } } - if (end >= (*rlp)->rl_start && end <= (*rlp)->rl_end) { - printf("Frag %d:%d, ent %d:%d ", start, end, - (*rlp)->rl_start, (*rlp)->rl_end); - panic("overlapping tail rlist_free: freed twice?"); + + if( (rl = rlfree) == 0 ) + panic("Cannot get an rlist entry"); + + --rlist_count; + rlfree = rl->rl_next; + return rl; +} + +static __inline void +rlist_mfree(rl) + struct rlist *rl; +{ + rl->rl_next = rlfree; + rlfree = rl; + ++rlist_count; +} + +void +rlist_free(rlh, start, end) + struct rlisthdr *rlh; + u_int start, end; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *prev_rlp = NULL, *cur_rlp, *next_rlp = NULL; + int s; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); - /* are we adjacent to this element? (in front) */ - if (end+1 == (*rlp)->rl_start) { - /* coalesce */ - (*rlp)->rl_start = start; - goto scan; + /* + * Traverse the list looking for an entry after the one we want + * to insert. + */ + cur_rlp = *rlp; + while (cur_rlp != NULL) { + if (start < cur_rlp->rl_start) + break; + if (prev_rlp) { + KASSERT(prev_rlp->rl_end + 1 != cur_rlp->rl_start, + ("rlist_free: missed coalesce opportunity")); + KASSERT(prev_rlp->rl_end != cur_rlp->rl_start, + ("rlist_free: entries overlap")); + KASSERT(prev_rlp->rl_end <= cur_rlp->rl_start, + ("entries out of order")); + } + prev_rlp = cur_rlp; + cur_rlp = cur_rlp->rl_next; } - /* are we before this element? 
*/ - if (end < (*rlp)->rl_start) { - register struct rlist *nlp; + if (cur_rlp != NULL) { + + if (end >= cur_rlp->rl_start) + panic("rlist_free: free end overlaps already freed area"); - nlp = (struct rlist *)malloc(sizeof(*nlp), M_TEMP, M_NOWAIT); - nlp->rl_start = start; - nlp->rl_end = end; - nlp->rl_next = *rlp; - *rlp = nlp; - return; + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + /* + * Attempt to prepend and coalesce + */ + if (end + 1 == cur_rlp->rl_start) { + prev_rlp->rl_end = cur_rlp->rl_end; + prev_rlp->rl_next = cur_rlp->rl_next; + rlist_mfree(cur_rlp); + } + goto done; + } + } + /* + * Attempt to prepend + */ + if (end + 1 == cur_rlp->rl_start) { + cur_rlp->rl_start = start; + goto done; + } + } + /* + * Reached the end of the list without finding a larger entry. + * Append to last entry if there is one and it's adjacent. + */ + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area at list tail"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + goto done; + } } - /* are we adjacent to this element? (at tail) */ - if ((*rlp)->rl_end + 1 == start) { - /* coalesce */ - (*rlp)->rl_end = end; - goto scan; + /* + * Could neither append nor prepend; allocate a new entry. + */ + next_rlp = cur_rlp; + cur_rlp = rlist_malloc(); + cur_rlp->rl_start = start; + cur_rlp->rl_end = end; + cur_rlp->rl_next = next_rlp; + if (prev_rlp) { + prev_rlp->rl_next = cur_rlp; + } else { + /* + * No previous - this entry is the new list head. + */ + *rlp = cur_rlp; } - /* are we after this element */ - if (start > (*rlp)->rl_end) { - rlp = &((*rlp)->rl_next); - goto loop; - } else - panic("rlist_free: can't happen"); - -scan: - /* can we coalesce list now that we've filled a void? */ - { - register struct rlist *lp, *lpn; - - for (lp = head; lp->rl_next ;) { - lpn = lp->rl_next; - - /* coalesce ? */ - if (lp->rl_end + 1 == lpn->rl_start) { - lp->rl_end = lpn->rl_end; - lp->rl_next = lpn->rl_next; - free(lpn, M_TEMP); - } else - lp = lp->rl_next; - } +done: + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; } + return; } /* @@ -147,10 +233,23 @@ scan: * return a value of 1 and set resource start location with * "*loc". (Note: loc can be zero if we don't wish the value) */ -int rlist_alloc (rlp, size, loc) -struct rlist **rlp; unsigned size, *loc; { +int +rlist_alloc (rlh, size, loc) + struct rlisthdr *rlh; + unsigned size, *loc; +{ + struct rlist **rlp = &rlh->rlh_list; register struct rlist *lp; + int s; + register struct rlist *olp = 0; + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); /* walk list, allocating first thing that's big enough (first fit) */ for (; *rlp; rlp = &((*rlp)->rl_next)) @@ -163,13 +262,33 @@ struct rlist **rlp; unsigned size, *loc; { /* did we eat this element entirely? */ if ((*rlp)->rl_start > (*rlp)->rl_end) { lp = (*rlp)->rl_next; - free (*rlp, M_TEMP); - *rlp = lp; + rlist_mfree(*rlp); + /* + * if the deleted element was in fromt + * of the list, adjust *rlp, else don't. 
+ */ + if (olp) { + olp->rl_next = lp; + } else { + *rlp = lp; + } } + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } return (1); + } else { + olp = *rlp; } + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } /* nothing in list that's big enough */ return (0); } @@ -178,14 +297,20 @@ struct rlist **rlp; unsigned size, *loc; { * Finished with this resource list, reclaim all space and * mark it as being empty. */ -rlist_destroy (rlp) -struct rlist **rlp; { +void +rlist_destroy (rlh) + struct rlisthdr *rlh; +{ + struct rlist **rlp = &rlh->rlh_list; struct rlist *lp, *nlp; lp = *rlp; *rlp = 0; for (; lp; lp = nlp) { nlp = lp->rl_next; - free (lp, M_TEMP); + rlist_mfree(lp); } } + +#endif + diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c new file mode 100644 index 0000000..e0526bb --- /dev/null +++ b/sys/kern/subr_rman.c @@ -0,0 +1,591 @@ +/* + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_rman.c,v 1.3 1998/12/07 21:58:29 archie Exp $ + */ + +/* + * The kernel resource manager. This code is responsible for keeping track + * of hardware resources which are apportioned out to various drivers. + * It does not actually assign those resources, and it is not expected + * that end-device drivers will call into this code directly. Rather, + * the code which implements the buses that those devices are attached to, + * and the code which manages CPU resources, will call this code, and the + * end-device drivers will make upcalls to that code to actually perform + * the allocation. + * + * There are two sorts of resources managed by this code. The first is + * the more familiar array (RMAN_ARRAY) type; resources in this class + * consist of a sequence of individually-allocatable objects which have + * been numbered in some well-defined order. Most of the resources + * are of this type, as it is the most familiar. 
The second type is + * called a gauge (RMAN_GAUGE), and models fungible resources (i.e., + * resources in which each instance is indistinguishable from every + * other instance). The principal anticipated application of gauges + * is in the context of power consumption, where a bus may have a specific + * power budget which all attached devices share. RMAN_GAUGE is not + * implemented yet. + * + * For array resources, we make one simplifying assumption: two clients + * sharing the same resource must use the same range of indices. That + * is to say, sharing of overlapping-but-not-identical regions is not + * permitted. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/rman.h> +#include <sys/bus.h> /* XXX debugging */ + +MALLOC_DEFINE(M_RMAN, "rman", "Resource manager"); + +struct rman_head rman_head; +#ifndef NULL_SIMPLELOCKS +static struct simplelock rman_lock; /* mutex to protect rman_head */ +#endif +static int int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas); +static int int_rman_release_resource(struct rman *rm, struct resource *r); + +#define CIRCLEQ_TERMCOND(var, head) (var == (void *)&(head)) + +int +rman_init(struct rman *rm) +{ + static int once; + + if (once == 0) { + once = 1; + TAILQ_INIT(&rman_head); + simple_lock_init(&rman_lock); + } + + if (rm->rm_type == RMAN_UNINIT) + panic("rman_init"); + if (rm->rm_type == RMAN_GAUGE) + panic("implement RMAN_GAUGE"); + + CIRCLEQ_INIT(&rm->rm_list); + rm->rm_slock = malloc(sizeof *rm->rm_slock, M_RMAN, M_NOWAIT); + if (rm->rm_slock == 0) + return ENOMEM; + simple_lock_init(rm->rm_slock); + + simple_lock(&rman_lock); + TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); + simple_unlock(&rman_lock); + return 0; +} + +/* + * NB: this interface is not robust against programming errors which + * add multiple copies of the same region. + */ +int +rman_manage_region(struct rman *rm, u_long start, u_long end) +{ + struct resource *r, *s; + + r = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (r == 0) + return ENOMEM; + r->r_sharehead = 0; + r->r_start = start; + r->r_end = end; + r->r_flags = 0; + r->r_dev = 0; + r->r_rm = rm; + + simple_lock(rm->rm_slock); + for (s = rm->rm_list.cqh_first; + !CIRCLEQ_TERMCOND(s, rm->rm_list) && s->r_end < r->r_start; + s = s->r_link.cqe_next) + ; + + if (CIRCLEQ_TERMCOND(s, rm->rm_list)) { + CIRCLEQ_INSERT_TAIL(&rm->rm_list, r, r_link); + } else { + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, r, r_link); + } + + simple_unlock(rm->rm_slock); + return 0; +} + +int +rman_fini(struct rman *rm) +{ + struct resource *r; + + simple_lock(rm->rm_slock); + for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list); + r = r->r_link.cqe_next) { + if (r->r_flags & RF_ALLOCATED) + return EBUSY; + } + + /* + * There really should only be one of these if we are in this + * state and the code is working properly, but it can't hurt. 
+ */ + for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list); + r = rm->rm_list.cqh_first) { + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + free(r, M_RMAN); + } + simple_unlock(rm->rm_slock); + simple_lock(&rman_lock); + TAILQ_REMOVE(&rman_head, rm, rm_link); + simple_unlock(&rman_lock); + free(rm->rm_slock, M_RMAN); + + return 0; +} + +struct resource * +rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count, + u_int flags, struct device *dev) +{ + u_int want_activate; + struct resource *r, *s, *rv; + u_long rstart, rend; + + rv = 0; + +#ifdef RMAN_DEBUG + printf("rman_reserve_resource: <%s> request: [%#lx, %#lx], length " + "%#lx, flags %u, device %s%d\n", rm->rm_descr, start, end, + count, flags, device_get_name(dev), device_get_unit(dev)); +#endif /* RMAN_DEBUG */ + want_activate = (flags & RF_ACTIVE); + flags &= ~RF_ACTIVE; + + simple_lock(rm->rm_slock); + + for (r = rm->rm_list.cqh_first; + !CIRCLEQ_TERMCOND(r, rm->rm_list) && r->r_end < start; + r = r->r_link.cqe_next) + ; + + if (CIRCLEQ_TERMCOND(r, rm->rm_list)) { +#ifdef RMAN_DEBUG + printf("could not find a region\n"); +#endif RMAN_DEBUG + goto out; + } + + /* + * First try to find an acceptable totally-unshared region. + */ + for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list); + s = s->r_link.cqe_next) { +#ifdef RMAN_DEBUG + printf("considering [%#lx, %#lx]\n", s->r_start, s->r_end); +#endif /* RMAN_DEBUG */ + if (s->r_start > end) { +#ifdef RMAN_DEBUG + printf("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end); +#endif /* RMAN_DEBUG */ + break; + } + if (s->r_flags & RF_ALLOCATED) { +#ifdef RMAN_DEBUG + printf("region is allocated\n"); +#endif /* RMAN_DEBUG */ + continue; + } + rstart = max(s->r_start, start); + rend = min(s->r_end, max(start + count, end)); +#ifdef RMAN_DEBUG + printf("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n", + rstart, rend, (rend - rstart + 1), count); +#endif /* RMAN_DEBUG */ + + if ((rend - rstart + 1) >= count) { +#ifdef RMAN_DEBUG + printf("candidate region: [%#lx, %#lx], size %#lx\n", + rend, rstart, (rend - rstart + 1)); +#endif /* RMAN_DEBUG */ + if ((s->r_end - s->r_start + 1) == count) { +#ifdef RMAN_DEBUG + printf("candidate region is entire chunk\n"); +#endif /* RMAN_DEBUG */ + rv = s; + rv->r_flags |= RF_ALLOCATED; + rv->r_dev = dev; + goto out; + } + + /* + * If s->r_start < rstart and + * s->r_end > rstart + count - 1, then + * we need to split the region into three pieces + * (the middle one will get returned to the user). + * Otherwise, we are allocating at either the + * beginning or the end of s, so we only need to + * split it in two. The first case requires + * two new allocations; the second requires but one. + */ + rv = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (rv == 0) + goto out; + rv->r_start = rstart; + rv->r_end = rstart + count - 1; + rv->r_flags = flags | RF_ALLOCATED; + rv->r_dev = dev; + rv->r_sharehead = 0; + + if (s->r_start < rv->r_start && s->r_end > rv->r_end) { +#ifdef RMAN_DEBUG + printf("splitting region in three parts: " + "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n", + s->r_start, rv->r_start - 1, + rv->r_start, rv->r_end, + rv->r_end + 1, s->r_end); +#endif /* RMAN_DEBUG */ + /* + * We are allocating in the middle. 
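+ * (A concrete, invented illustration: given a free region [0x100, 0x1ff]
+ * and a request for count 0x10 whose allowed range starts at 0x140, rstart
+ * is 0x140, the existing region shrinks to [0x100, 0x13f], the returned
+ * piece is [0x140, 0x14f], and a new region [0x150, 0x1ff] is inserted
+ * after it.)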
+ */ + r = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (r == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + r->r_start = rv->r_end + 1; + r->r_end = s->r_end; + r->r_flags = s->r_flags; + r->r_dev = 0; + r->r_sharehead = 0; + s->r_end = rv->r_start - 1; + CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + CIRCLEQ_INSERT_AFTER(&rm->rm_list, rv, r, + r_link); + } else if (s->r_start == rv->r_start) { +#ifdef RMAN_DEBUG + printf("allocating from the beginning\n"); +#endif /* RMAN_DEBUG */ + /* + * We are allocating at the beginning. + */ + s->r_start = rv->r_end + 1; + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, rv, + r_link); + } else { +#ifdef RMAN_DEBUG + printf("allocating at the end\n"); +#endif /* RMAN_DEBUG */ + /* + * We are allocating at the end. + */ + s->r_end = rv->r_start - 1; + CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + } + goto out; + } + } + + /* + * Now find an acceptable shared region, if the client's requirements + * allow sharing. By our implementation restriction, a candidate + * region must match exactly by both size and sharing type in order + * to be considered compatible with the client's request. (The + * former restriction could probably be lifted without too much + * additional work, but this does not seem warranted.) + */ +#ifdef RMAN_DEBUG + printf("no unshared regions found\n"); +#endif /* RMAN_DEBUG */ + if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0) + goto out; + + for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list); + s = s->r_link.cqe_next) { + if (s->r_start > end) + break; + if ((s->r_flags & flags) != flags) + continue; + rstart = max(s->r_start, start); + rend = min(s->r_end, max(start + count, end)); + if (s->r_start >= start && s->r_end <= end + && (s->r_end - s->r_start + 1) == count) { + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT); + if (rv == 0) + goto out; + rv->r_start = s->r_start; + rv->r_end = s->r_end; + rv->r_flags = s->r_flags & + (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE); + rv->r_dev = dev; + rv->r_rm = rm; + if (s->r_sharehead == 0) { + s->r_sharehead = malloc(sizeof *s->r_sharehead, + M_RMAN, M_NOWAIT); + if (s->r_sharehead == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + LIST_INIT(s->r_sharehead); + LIST_INSERT_HEAD(s->r_sharehead, s, + r_sharelink); + s->r_flags = RF_FIRSTSHARE; + } + rv->r_sharehead = s->r_sharehead; + LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink); + goto out; + } + } + + /* + * We couldn't find anything. + */ +out: + /* + * If the user specified RF_ACTIVE in the initial flags, + * which is reflected in `want_activate', we attempt to atomically + * activate the resource. If this fails, we release the resource + * and indicate overall failure. (This behavior probably doesn't + * make sense for RF_TIMESHARE-type resources.) + */ + if (rv && want_activate) { + struct resource *whohas; + if (int_rman_activate_resource(rm, rv, &whohas)) { + int_rman_release_resource(rm, rv); + rv = 0; + } + } + + simple_unlock(rm->rm_slock); + return (rv); +} + +static int +int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas) +{ + struct resource *s; + int ok; + + /* + * If we are not timesharing, then there is nothing much to do. + * If we already have the resource, then there is nothing at all to do. + * If we are not on a sharing list with anybody else, then there is + * little to do. 
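Taken together, the routines above are meant to be driven by bus code roughly as in the sketch below. This is illustrative only, not code from the tree: the rman instance, the description string, the 0..15 range and the printf are invented, and the struct rman fields (rm_type, rm_descr), RMAN_ARRAY, RF_ACTIVE and the r_start member are assumed to come from <sys/rman.h> as used elsewhere in this file.

static struct rman irq_rman;	/* hypothetical: one rman per resource type */

static int
example_bus_attach(void)
{
	struct resource *res;

	irq_rman.rm_type = RMAN_ARRAY;
	irq_rman.rm_descr = "example interrupt lines";
	if (rman_init(&irq_rman) != 0 ||
	    rman_manage_region(&irq_rman, 0, 15) != 0)
		return (ENXIO);

	/* any single line in [0, 15], activated atomically via RF_ACTIVE */
	res = rman_reserve_resource(&irq_rman, 0, 15, 1, RF_ACTIVE, NULL);
	if (res == NULL)
		return (ENXIO);
	printf("reserved line %lu\n", res->r_start);

	/* an RF_ACTIVE resource must be deactivated before release (EBUSY) */
	rman_deactivate_resource(res);
	return (rman_release_resource(res));
}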
+ */ + if ((r->r_flags & RF_TIMESHARE) == 0 + || (r->r_flags & RF_ACTIVE) != 0 + || r->r_sharehead == 0) { + r->r_flags |= RF_ACTIVE; + return 0; + } + + ok = 1; + for (s = r->r_sharehead->lh_first; s && ok; + s = s->r_sharelink.le_next) { + if ((s->r_flags & RF_ACTIVE) != 0) { + ok = 0; + *whohas = s; + } + } + if (ok) { + r->r_flags |= RF_ACTIVE; + return 0; + } + return EBUSY; +} + +int +rman_activate_resource(struct resource *r) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + simple_lock(rm->rm_slock); + rv = int_rman_activate_resource(rm, r, &whohas); + simple_unlock(rm->rm_slock); + return rv; +} + +int +rman_await_resource(struct resource *r, int pri, int timo) +{ + int rv, s; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + for (;;) { + simple_lock(rm->rm_slock); + rv = int_rman_activate_resource(rm, r, &whohas); + if (rv != EBUSY) + return (rv); + + if (r->r_sharehead == 0) + panic("rman_await_resource"); + /* + * splhigh hopefully will prevent a race between + * simple_unlock and tsleep where a process + * could conceivably get in and release the resource + * before we have a chance to sleep on it. + */ + s = splhigh(); + whohas->r_flags |= RF_WANTED; + simple_unlock(rm->rm_slock); + rv = tsleep(r->r_sharehead, pri, "rmwait", timo); + if (rv) { + splx(s); + return rv; + } + simple_lock(rm->rm_slock); + splx(s); + } +} + +int +rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + simple_lock(rm->rm_slock); + r->r_flags &= ~RF_ACTIVE; + if (r->r_flags & RF_WANTED) { + r->r_flags &= ~RF_WANTED; + wakeup(r->r_sharehead); + } + simple_unlock(rm->rm_slock); + return 0; +} + +static int +int_rman_release_resource(struct rman *rm, struct resource *r) +{ + struct resource *s, *t; + + if (r->r_flags & RF_ACTIVE) + return EBUSY; + + /* + * Check for a sharing list first. If there is one, then we don't + * have to think as hard. + */ + if (r->r_sharehead) { + /* + * If a sharing list exists, then we know there are at + * least two sharers. + * + * If we are in the main circleq, appoint someone else. + */ + LIST_REMOVE(r, r_sharelink); + s = r->r_sharehead->lh_first; + if (r->r_flags & RF_FIRSTSHARE) { + s->r_flags |= RF_FIRSTSHARE; + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, r, s, r_link); + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } + + /* + * Make sure that the sharing list goes away completely + * if the resource is no longer being shared at all. + */ + if (s->r_sharelink.le_next == 0) { + free(s->r_sharehead, M_RMAN); + s->r_sharehead = 0; + s->r_flags &= ~RF_FIRSTSHARE; + } + goto out; + } + + /* + * Look at the adjacent resources in the list and see if our + * segment can be merged with any of them. + */ + s = r->r_link.cqe_prev; + t = r->r_link.cqe_next; + + if (s != (void *)&rm->rm_list && (s->r_flags & RF_ALLOCATED) == 0 + && t != (void *)&rm->rm_list && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge all three segments. + */ + s->r_end = t->r_end; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + CIRCLEQ_REMOVE(&rm->rm_list, t, r_link); + free(t, M_RMAN); + } else if (s != (void *)&rm->rm_list + && (s->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge previous segment with ours. + */ + s->r_end = r->r_end; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } else if (t != (void *)&rm->rm_list + && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge next segment with ours. 
+ */ + t->r_start = r->r_start; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } else { + /* + * At this point, we know there is nothing we + * can potentially merge with, because on each + * side, there is either nothing there or what is + * there is still allocated. In that case, we don't + * want to remove r from the list; we simply want to + * change it to an unallocated region and return + * without freeing anything. + */ + r->r_flags &= ~RF_ALLOCATED; + return 0; + } + +out: + free(r, M_RMAN); + return 0; +} + +int +rman_release_resource(struct resource *r) +{ + int rv; + struct rman *rm = r->r_rm; + + simple_lock(rm->rm_slock); + rv = int_rman_release_resource(rm, r); + simple_unlock(rm->rm_slock); + return (rv); +} diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c new file mode 100644 index 0000000..24f8846 --- /dev/null +++ b/sys/kern/subr_scanf.c @@ -0,0 +1,793 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <machine/limits.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define BUF 32 /* Maximum length of numeric string. */ + +/* + * Flags used during conversion. 
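+ * (As an invented example, a format such as "%*lx" picks up SUPPRESS,
+ * LONG and, once the `x' is reached, PFXOK, all before any input
+ * characters are consumed.)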
+ */ +#define LONG 0x01 /* l: long or double */ +#define SHORT 0x04 /* h: short */ +#define SUPPRESS 0x08 /* suppress assignment */ +#define POINTER 0x10 /* weird %p pointer (`fake hex') */ +#define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 + +/* + * The following are used in numeric conversions only: + * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point; + * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral. + */ +#define SIGNOK 0x40 /* +/- is (still) legal */ +#define NDIGITS 0x80 /* no digits detected */ + +#define DPTOK 0x100 /* (float) decimal point is still legal */ +#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */ + +#define PFXOK 0x100 /* 0x prefix is (still) legal */ +#define NZDIGITS 0x200 /* no zero digits detected */ + +/* + * Conversion types. + */ +#define CT_CHAR 0 /* %c conversion */ +#define CT_CCL 1 /* %[...] conversion */ +#define CT_STRING 2 /* %s conversion */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ +typedef u_quad_t (*ccfntype)(const char *, char **, int); + +#define isspace(c) ((c) == ' ' || (c) == '\t' || \ + (c) == '\r' || (c) == '\n') +#define isascii(c) (((c) & ~0x7f) == 0) +#define isupper(c) ((c) >= 'A' && (c) <= 'Z') +#define islower(c) ((c) >= 'a' && (c) <= 'z') +#define isalpha(c) (isupper(c) || (islower(c))) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + +static u_char *__sccl(char *, u_char *); + +int +sscanf(const char *ibuf, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vsscanf(ibuf, fmt, ap); + va_end(ap); + return(ret); +} + +int +vsscanf(const char *inp, char const *fmt0, va_list ap) +{ + int inr; + u_char *fmt = (u_char *)fmt0; + int c; /* character from format, or conversion */ + size_t width; /* field width, or 0 */ + char *p; /* points into all kinds of strings */ + int n; /* handy integer */ + int flags; /* flags as defined above */ + char *p0; /* saves original value of p when necessary */ + int nassigned; /* number of fields assigned */ + int nconversions; /* number of conversions */ + int nread; /* number of characters consumed from fp */ + int base; /* base argument to strtoq/strtouq */ + ccfntype ccfn; /* conversion function (strtoq/strtouq) */ + char ccltab[256]; /* character class table for %[...] */ + char buf[BUF]; /* buffer for numeric conversions */ + + /* `basefix' is used to avoid `if' tests in the integer scanner */ + static short basefix[17] = + { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + inr = strlen(inp); + + nassigned = 0; + nconversions = 0; + nread = 0; + base = 0; /* XXX just to keep gcc happy */ + ccfn = NULL; /* XXX just to keep gcc happy */ + for (;;) { + c = *fmt++; + if (c == 0) + return (nassigned); + if (isspace(c)) { + while (inr > 0 && isspace(*inp)) + nread++, inr--, inp++; + continue; + } + if (c != '%') + goto literal; + width = 0; + flags = 0; + /* + * switch on the format. continue if done; + * break once format type is derived. + */ +again: c = *fmt++; + switch (c) { + case '%': +literal: + if (inr <= 0) + goto input_failure; + if (*inp != c) + goto match_failure; + inr--, inp++; + nread++; + continue; + + case '*': + flags |= SUPPRESS; + goto again; + case 'l': + flags |= LONG; + goto again; + case 'q': + flags |= QUAD; + goto again; + case 'h': + flags |= SHORT; + goto again; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + width = width * 10 + c - '0'; + goto again; + + /* + * Conversions. 
+ * + */ + case 'd': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 10; + break; + + case 'i': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 0; + break; + + case 'o': + c = CT_INT; + ccfn = strtouq; + base = 8; + break; + + case 'u': + c = CT_INT; + ccfn = strtouq; + base = 10; + break; + + case 'x': + flags |= PFXOK; /* enable 0x prefixing */ + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 's': + c = CT_STRING; + break; + + case '[': + fmt = __sccl(ccltab, fmt); + flags |= NOSKIP; + c = CT_CCL; + break; + + case 'c': + flags |= NOSKIP; + c = CT_CHAR; + break; + + case 'p': /* pointer format is like hex */ + flags |= POINTER | PFXOK; + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 'n': + nconversions++; + if (flags & SUPPRESS) /* ??? */ + continue; + if (flags & SHORT) + *va_arg(ap, short *) = nread; + else if (flags & LONG) + *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; + else + *va_arg(ap, int *) = nread; + continue; + } + + /* + * We have a conversion that requires input. + */ + if (inr <= 0) + goto input_failure; + + /* + * Consume leading white space, except for formats + * that suppress this. + */ + if ((flags & NOSKIP) == 0) { + while (isspace(*inp)) { + nread++; + if (--inr > 0) + inp++; + else + goto input_failure; + } + /* + * Note that there is at least one character in + * the buffer, so conversions that do not set NOSKIP + * can no longer result in an input failure. + */ + } + + /* + * Do the conversion. + */ + switch (c) { + + case CT_CHAR: + /* scan arbitrary characters (sets NOSKIP) */ + if (width == 0) + width = 1; + if (flags & SUPPRESS) { + size_t sum = 0; + for (;;) { + if ((n = inr) < width) { + sum += n; + width -= n; + inp += n; + if (sum == 0) + goto input_failure; + break; + } else { + sum += width; + inr -= width; + inp += width; + break; + } + } + nread += sum; + } else { + bcopy(inp, va_arg(ap, char *), width); + inr -= width; + inp += width; + nread += width; + nassigned++; + } + nconversions++; + break; + + case CT_CCL: + /* scan a (nonempty) character class (sets NOSKIP) */ + if (width == 0) + width = (size_t)~0; /* `infinity' */ + /* take only those things in the class */ + if (flags & SUPPRESS) { + n = 0; + while (ccltab[*inp]) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (n == 0) + goto input_failure; + break; + } + } + if (n == 0) + goto match_failure; + } else { + p0 = p = va_arg(ap, char *); + while (ccltab[*inp]) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (p == p0) + goto input_failure; + break; + } + } + n = p - p0; + if (n == 0) + goto match_failure; + *p = 0; + nassigned++; + } + nread += n; + nconversions++; + break; + + case CT_STRING: + /* like CCL, but zero-length string OK, & no NOSKIP */ + if (width == 0) + width = (size_t)~0; + if (flags & SUPPRESS) { + n = 0; + while (!isspace(*inp)) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + nread += n; + } else { + p0 = p = va_arg(ap, char *); + while (!isspace(*inp)) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + *p = 0; + nread += p - p0; + nassigned++; + } + nconversions++; + continue; + + case CT_INT: + /* scan an integer as if by strtoq/strtouq */ +#ifdef hardway + if (width == 0 || width > sizeof(buf) - 1) + width = sizeof(buf) - 1; +#else + /* size_t is unsigned, hence this optimisation */ + if (--width > sizeof(buf) - 2) + width = sizeof(buf) - 2; + width++; +#endif + flags 
|= SIGNOK | NDIGITS | NZDIGITS; + for (p = buf; width; width--) { + c = *inp; + /* + * Switch on the character; `goto ok' + * if we accept it as a part of number. + */ + switch (c) { + + /* + * The digit 0 is always legal, but is + * special. For %i conversions, if no + * digits (zero or nonzero) have been + * scanned (only signs), we will have + * base==0. In that case, we should set + * it to 8 and enable 0x prefixing. + * Also, if we have not scanned zero digits + * before this, do not turn off prefixing + * (someone else will turn it off if we + * have scanned any nonzero digits). + */ + case '0': + if (base == 0) { + base = 8; + flags |= PFXOK; + } + if (flags & NZDIGITS) + flags &= ~(SIGNOK|NZDIGITS|NDIGITS); + else + flags &= ~(SIGNOK|PFXOK|NDIGITS); + goto ok; + + /* 1 through 7 always legal */ + case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + base = basefix[base]; + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* digits 8 and 9 ok iff decimal or hex */ + case '8': case '9': + base = basefix[base]; + if (base <= 8) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* letters ok iff hex */ + case 'A': case 'B': case 'C': + case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': + case 'd': case 'e': case 'f': + /* no need to fix base here */ + if (base <= 10) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* sign ok only as first character */ + case '+': case '-': + if (flags & SIGNOK) { + flags &= ~SIGNOK; + goto ok; + } + break; + + /* x ok iff flag still set & 2nd char */ + case 'x': case 'X': + if (flags & PFXOK && p == buf + 1) { + base = 16; /* if %i */ + flags &= ~PFXOK; + goto ok; + } + break; + } + + /* + * If we got here, c is not a legal character + * for a number. Stop accumulating digits. + */ + break; + ok: + /* + * c is legal: store it and look at the next. + */ + *p++ = c; + if (--inr > 0) + inp++; + else + break; /* end of input */ + } + /* + * If we had only a sign, it is no good; push + * back the sign. If the number ends in `x', + * it was [sign] '0' 'x', so push back the x + * and treat it as [sign] '0'. + */ + if (flags & NDIGITS) { + if (p > buf) { + inp--; + inr++; + } + goto match_failure; + } + c = ((u_char *)p)[-1]; + if (c == 'x' || c == 'X') { + --p; + inp--; + inr++; + } + if ((flags & SUPPRESS) == 0) { + u_quad_t res; + + *p = 0; + res = (*ccfn)(buf, (char **)NULL, base); + if (flags & POINTER) + *va_arg(ap, void **) = + (void *)(u_long)res; + else if (flags & SHORT) + *va_arg(ap, short *) = res; + else if (flags & LONG) + *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; + else + *va_arg(ap, int *) = res; + nassigned++; + } + nread += p - buf; + nconversions++; + break; + + } + } +input_failure: + return (nconversions != 0 ? nassigned : -1); +match_failure: + return (nassigned); +} + +/* + * Fill in the given table from the scanset at the given format + * (just after `['). Return a pointer to the character past the + * closing `]'. The table has a 1 wherever characters should be + * considered part of the scanset. 
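The table built here is what makes a conversion such as "%[a-z]" work in vsscanf() above. A minimal usage sketch, illustrative only (the input string, variables and printf are invented):

static void
example_sscanf(void)
{
	char dname[16];
	int unit;

	/* "%[a-z]" collects the alphabetic prefix, "%d" the unit number */
	if (sscanf("wd1", "%[a-z]%d", dname, &unit) == 2)
		printf("driver %s, unit %d\n", dname, unit);
}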
+ */ +static u_char * +__sccl(char *tab, u_char *fmt) +{ + int c, n, v; + + /* first `clear' the whole table */ + c = *fmt++; /* first char hat => negated scanset */ + if (c == '^') { + v = 1; /* default => accept */ + c = *fmt++; /* get new first char */ + } else + v = 0; /* default => reject */ + + /* XXX: Will not work if sizeof(tab*) > sizeof(char) */ + for (n = 0; n < 256; n++) + tab[n] = v; /* memset(tab, v, 256) */ + + if (c == 0) + return (fmt - 1);/* format ended before closing ] */ + + /* + * Now set the entries corresponding to the actual scanset + * to the opposite of the above. + * + * The first character may be ']' (or '-') without being special; + * the last character may be '-'. + */ + v = 1 - v; + for (;;) { + tab[c] = v; /* take character c */ +doswitch: + n = *fmt++; /* and examine the next */ + switch (n) { + + case 0: /* format ended too soon */ + return (fmt - 1); + + case '-': + /* + * A scanset of the form + * [01+-] + * is defined as `the digit 0, the digit 1, + * the character +, the character -', but + * the effect of a scanset such as + * [a-zA-Z0-9] + * is implementation defined. The V7 Unix + * scanf treats `a-z' as `the letters a through + * z', but treats `a-a' as `the letter a, the + * character -, and the letter a'. + * + * For compatibility, the `-' is not considerd + * to define a range if the character following + * it is either a close bracket (required by ANSI) + * or is not numerically greater than the character + * we just stored in the table (c). + */ + n = *fmt; + if (n == ']' || n < c) { + c = '-'; + break; /* resume the for(;;) */ + } + fmt++; + /* fill in the range */ + do { + tab[++c] = v; + } while (c < n); + c = n; + /* + * Alas, the V7 Unix scanf also treats formats + * such as [a-c-e] as `the letters a through e'. + * This too is permitted by the standard.... + */ + goto doswitch; + break; + + case ']': /* end of scanset */ + return (fmt); + + default: /* just another character */ + c = n; + break; + } + } + /* NOTREACHED */ +} + +/* + * Convert a string to an unsigned quad integer. + * + * Ignores `locale' stuff. Assumes that the upper and lower case + * alphabets and digits are each contiguous. + */ +u_quad_t +strtouq(const char *nptr, char **endptr, int base) +{ + const char *s = nptr; + u_quad_t acc; + unsigned char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; + + /* + * See strtoq for comments as to the logic used. + */ + s = nptr; + do { + c = *s++; + } while (isspace(c)); + if (c == '-') { + neg = 1; + c = *s++; + } else { + neg = 0; + if (c == '+') + c = *s++; + } + if ((base == 0 || base == 16) && + c == '0' && (*s == 'x' || *s == 'X')) { + c = s[1]; + s += 2; + base = 16; + } + if (base == 0) + base = c == '0' ? 8 : 10; + qbase = (unsigned)base; + cutoff = (u_quad_t)UQUAD_MAX / qbase; + cutlim = (u_quad_t)UQUAD_MAX % qbase; + for (acc = 0, any = 0;; c = *s++) { + if (!isascii(c)) + break; + if (isdigit(c)) + c -= '0'; + else if (isalpha(c)) + c -= isupper(c) ? 'A' - 10 : 'a' - 10; + else + break; + if (c >= base) + break; + if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= qbase; + acc += c; + } + } + if (any < 0) { + acc = UQUAD_MAX; + } else if (neg) + acc = -acc; + if (endptr != 0) + *endptr = (char *)(any ? s - 1 : nptr); + return (acc); +} + +/* + * Convert a string to a quad integer. + * + * Ignores `locale' stuff. Assumes that the upper and lower case + * alphabets and digits are each contiguous. 
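The prefix handling in strtouq() above means that base 0 auto-detects the radix and that endptr is left pointing at the first unconverted character. A small sketch, illustrative only (input string and printf are invented):

static void
example_strtouq(void)
{
	char *ep;
	u_quad_t v;

	v = strtouq("0x1fff-rest", &ep, 0);
	/* v is now 0x1fff (8191) and ep points at the "-rest" suffix */
	printf("value %lu, stopped at \"%s\"\n", (u_long)v, ep);
}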
+ */ +quad_t +strtoq(const char *nptr, char **endptr, int base) +{ + const char *s; + u_quad_t acc; + unsigned char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; + + /* + * Skip white space and pick up leading +/- sign if any. + * If base is 0, allow 0x for hex and 0 for octal, else + * assume decimal; if base is already 16, allow 0x. + */ + s = nptr; + do { + c = *s++; + } while (isspace(c)); + if (c == '-') { + neg = 1; + c = *s++; + } else { + neg = 0; + if (c == '+') + c = *s++; + } + if ((base == 0 || base == 16) && + c == '0' && (*s == 'x' || *s == 'X')) { + c = s[1]; + s += 2; + base = 16; + } + if (base == 0) + base = c == '0' ? 8 : 10; + + /* + * Compute the cutoff value between legal numbers and illegal + * numbers. That is the largest legal value, divided by the + * base. An input number that is greater than this value, if + * followed by a legal input character, is too big. One that + * is equal to this value may be valid or not; the limit + * between valid and invalid numbers is then based on the last + * digit. For instance, if the range for quads is + * [-9223372036854775808..9223372036854775807] and the input base + * is 10, cutoff will be set to 922337203685477580 and cutlim to + * either 7 (neg==0) or 8 (neg==1), meaning that if we have + * accumulated a value > 922337203685477580, or equal but the + * next digit is > 7 (or 8), the number is too big, and we will + * return a range error. + * + * Set any if any `digits' consumed; make it negative to indicate + * overflow. + */ + qbase = (unsigned)base; + cutoff = neg ? (u_quad_t)-(QUAD_MIN + QUAD_MAX) + QUAD_MAX : QUAD_MAX; + cutlim = cutoff % qbase; + cutoff /= qbase; + for (acc = 0, any = 0;; c = *s++) { + if (!isascii(c)) + break; + if (isdigit(c)) + c -= '0'; + else if (isalpha(c)) + c -= isupper(c) ? 'A' - 10 : 'a' - 10; + else + break; + if (c >= base) + break; + if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= qbase; + acc += c; + } + } + if (any < 0) { + acc = neg ? QUAD_MIN : QUAD_MAX; + } else if (neg) + acc = -acc; + if (endptr != 0) + *endptr = (char *)(any ? s - 1 : nptr); + return (acc); +} diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c new file mode 100644 index 0000000..569f04b --- /dev/null +++ b/sys/kern/subr_smp.c @@ -0,0 +1,2663 @@ +/* + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: mp_machdep.c,v 1.87 1999/01/12 00:19:31 eivind Exp $ + */ + +#include "opt_smp.h" +#include "opt_vm86.h" +#include "opt_cpu.h" +#include "opt_user_ldt.h" + +#ifdef SMP +#include <machine/smptests.h> +#else +#error +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#ifdef BETTER_CLOCK +#include <sys/dkstat.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#ifdef BETTER_CLOCK +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#ifdef GPROF +#include <sys/gmon.h> +#endif +#endif + +#include <machine/smp.h> +#include <machine/apic.h> +#include <machine/mpapic.h> +#include <machine/segments.h> +#include <machine/smptests.h> /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ +#include <machine/tss.h> +#include <machine/specialreg.h> +#include <machine/cputypes.h> +#include <machine/globaldata.h> + +#include <i386/i386/cons.h> /* cngetc() */ + +#if defined(APIC_IO) +#include <machine/md_var.h> /* setidt() */ +#include <i386/isa/icu.h> /* IPIs */ +#include <i386/isa/intr_machdep.h> /* IPIs */ +#endif /* APIC_IO */ + +#if defined(TEST_DEFAULT_CONFIG) +#define MPFPS_MPFB1 TEST_DEFAULT_CONFIG +#else +#define MPFPS_MPFB1 mpfps->mpfb1 +#endif /* TEST_DEFAULT_CONFIG */ + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#ifdef PC98 +#define BIOS_BASE (0xe8000) +#define BIOS_SIZE (0x18000) +#else +#define BIOS_BASE (0xf0000) +#define BIOS_SIZE (0x10000) +#endif +#define BIOS_COUNT (BIOS_SIZE/4) + +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) + +#define PROCENTRY_FLAG_EN 0x01 +#define PROCENTRY_FLAG_BP 0x02 +#define IOAPICENTRY_FLAG_EN 0x01 + + +/* MP Floating Pointer Structure */ +typedef struct MPFPS { + char signature[4]; + void *pap; + u_char length; + u_char spec_rev; + u_char checksum; + u_char mpfb1; + u_char mpfb2; + u_char mpfb3; + u_char mpfb4; + u_char mpfb5; +} *mpfps_t; + +/* MP Configuration Table Header */ +typedef struct MPCTH { + char signature[4]; + u_short base_table_length; + u_char spec_rev; + u_char checksum; + u_char oem_id[8]; + u_char product_id[12]; + void *oem_table_pointer; + u_short oem_table_size; + u_short entry_count; + void *apic_address; + u_short extended_table_length; + u_char extended_table_checksum; + u_char reserved; +} *mpcth_t; + + +typedef struct PROCENTRY { + u_char type; + u_char apic_id; + u_char apic_version; + u_char cpu_flags; + u_long cpu_signature; + u_long feature_flags; + u_long reserved1; + u_long reserved2; +} *proc_entry_ptr; + +typedef struct BUSENTRY { + u_char type; + u_char bus_id; + char bus_type[6]; +} *bus_entry_ptr; + +typedef struct IOAPICENTRY { + u_char type; + u_char apic_id; + u_char apic_version; + u_char apic_flags; + void *apic_address; +} *io_apic_entry_ptr; + +typedef struct INTENTRY { + u_char type; + u_char 
int_type; + u_short int_flags; + u_char src_bus_id; + u_char src_bus_irq; + u_char dst_apic_id; + u_char dst_apic_int; +} *int_entry_ptr; + +/* descriptions of MP basetable entries */ +typedef struct BASETABLE_ENTRY { + u_char type; + u_char length; + char name[16]; +} basetable_entry; + +/* + * this code MUST be enabled here and in mpboot.s. + * it follows the very early stages of AP boot by placing values in CMOS ram. + * it NORMALLY will never be needed and thus the primitive method for enabling. + * +#define CHECK_POINTS + */ + +#if defined(CHECK_POINTS) && !defined(PC98) +#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) +#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) + +#define CHECK_INIT(D); \ + CHECK_WRITE(0x34, (D)); \ + CHECK_WRITE(0x35, (D)); \ + CHECK_WRITE(0x36, (D)); \ + CHECK_WRITE(0x37, (D)); \ + CHECK_WRITE(0x38, (D)); \ + CHECK_WRITE(0x39, (D)); + +#define CHECK_PRINT(S); \ + printf("%s: %d, %d, %d, %d, %d, %d\n", \ + (S), \ + CHECK_READ(0x34), \ + CHECK_READ(0x35), \ + CHECK_READ(0x36), \ + CHECK_READ(0x37), \ + CHECK_READ(0x38), \ + CHECK_READ(0x39)); + +#else /* CHECK_POINTS */ + +#define CHECK_INIT(D) +#define CHECK_PRINT(S) + +#endif /* CHECK_POINTS */ + +/* + * Values to send to the POST hardware. + */ +#define MP_BOOTADDRESS_POST 0x10 +#define MP_PROBE_POST 0x11 +#define MPTABLE_PASS1_POST 0x12 + +#define MP_START_POST 0x13 +#define MP_ENABLE_POST 0x14 +#define MPTABLE_PASS2_POST 0x15 + +#define START_ALL_APS_POST 0x16 +#define INSTALL_AP_TRAMP_POST 0x17 +#define START_AP_POST 0x18 + +#define MP_ANNOUNCE_POST 0x19 + + +/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ +int current_postcode; + +/** XXX FIXME: what system files declare these??? */ +extern struct region_descriptor r_gdt, r_idt; + +int bsp_apic_ready = 0; /* flags useability of BSP apic */ +int mp_ncpus; /* # of CPUs, including BSP */ +int mp_naps; /* # of Applications processors */ +int mp_nbusses; /* # of busses */ +int mp_napics; /* # of IO APICs */ +int boot_cpu_id; /* designated BSP */ +vm_offset_t cpu_apic_address; +vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */ +extern int nkpt; + +u_int32_t cpu_apic_versions[NCPU]; +u_int32_t io_apic_versions[NAPIC]; + +#ifdef APIC_INTR_DIAGNOSTIC +int apic_itrace_enter[32]; +int apic_itrace_tryisrlock[32]; +int apic_itrace_gotisrlock[32]; +int apic_itrace_active[32]; +int apic_itrace_masked[32]; +int apic_itrace_noisrlock[32]; +int apic_itrace_masked2[32]; +int apic_itrace_unmask[32]; +int apic_itrace_noforward[32]; +int apic_itrace_leave[32]; +int apic_itrace_enter2[32]; +int apic_itrace_doreti[32]; +int apic_itrace_splz[32]; +int apic_itrace_eoi[32]; +#ifdef APIC_INTR_DIAGNOSTIC_IRQ +unsigned short apic_itrace_debugbuffer[32768]; +int apic_itrace_debugbuffer_idx; +struct simplelock apic_itrace_debuglock; +#endif +#endif + +#ifdef APIC_INTR_REORDER +struct { + volatile int *location; + int bit; +} apic_isrbit_location[32]; +#endif + +struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE]; + +/* + * APIC ID logical/physical mapping structures. + * We oversize these to simplify boot-time config. + */ +int cpu_num_to_apic_id[NAPICID]; +int io_num_to_apic_id[NAPICID]; +int apic_id_to_logical[NAPICID]; + + +/* Bitmap of all available CPUs */ +u_int all_cpus; + +/* AP uses this PTD during bootstrap. Do not staticize. 
*/ +pd_entry_t *bootPTD; + +/* Hotwire a 0->4MB V==P mapping */ +extern pt_entry_t *KPTphys; + +/* Virtual address of per-cpu common_tss */ +extern struct i386tss common_tss; +#ifdef VM86 +extern struct segment_descriptor common_tssd; +extern u_int private_tss; /* flag indicating private tss */ +extern u_int my_tr; +#endif /* VM86 */ + +/* IdlePTD per cpu */ +pd_entry_t *IdlePTDS[NCPU]; + +/* "my" private page table page, for BSP init */ +extern pt_entry_t SMP_prvpt[]; + +/* Private page pointer to curcpu's PTD, used during BSP init */ +extern pd_entry_t *my_idlePTD; + +struct pcb stoppcbs[NCPU]; + +int smp_started; /* has the system started? */ + +/* + * Local data and functions. + */ + +static int mp_capable; +static u_int boot_address; +static u_int base_memory; + +static int picmode; /* 0: virtual wire mode, 1: PIC mode */ +static mpfps_t mpfps; +static int search_for_sig(u_int32_t target, int count); +static void mp_enable(u_int boot_addr); + +static int mptable_pass1(void); +static int mptable_pass2(void); +static void default_mp_table(int type); +static void fix_mp_table(void); +static void setup_apic_irq_mapping(void); +static void init_locks(void); +static int start_all_aps(u_int boot_addr); +static void install_ap_tramp(u_int boot_addr); +static int start_ap(int logicalCpu, u_int boot_addr); + +/* + * Calculate usable address in base memory for AP trampoline code. + */ +u_int +mp_bootaddress(u_int basemem) +{ + POSTCODE(MP_BOOTADDRESS_POST); + + base_memory = basemem * 1024; /* convert to bytes */ + + boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ + if ((base_memory - boot_address) < bootMP_size) + boot_address -= 4096; /* not enough, lower by 4k */ + + return boot_address; +} + + +/* + * Look for an Intel MP spec table (ie, SMP capable hardware). + */ +int +mp_probe(void) +{ + int x; + u_long segment; + u_int32_t target; + + POSTCODE(MP_PROBE_POST); + + /* see if EBDA exists */ + if (segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) { + /* search first 1K of EBDA */ + target = (u_int32_t) (segment << 4); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } else { + /* last 1K of base memory, effective 'top of base' passed in */ + target = (u_int32_t) (base_memory - 0x400); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } + + /* search the BIOS */ + target = (u_int32_t) BIOS_BASE; + if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) + goto found; + + /* nothing found */ + mpfps = (mpfps_t)0; + mp_capable = 0; + return 0; + +found: + /* calculate needed resources */ + mpfps = (mpfps_t)x; + if (mptable_pass1()) + panic("you must reconfigure your kernel"); + + /* flag fact that we are running multiple processors */ + mp_capable = 1; + return 1; +} + + +/* + * Startup the SMP processors. + */ +void +mp_start(void) +{ + POSTCODE(MP_START_POST); + + /* look for MP capable motherboard */ + if (mp_capable) + mp_enable(boot_address); + else + panic("MP hardware not found!"); +} + + +/* + * Print various information about the SMP system hardware and setup. 
+ */ +void +mp_announce(void) +{ + int x; + + POSTCODE(MP_ANNOUNCE_POST); + + printf("FreeBSD/SMP: Multiprocessor motherboard\n"); + printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0)); + printf(", version: 0x%08x", cpu_apic_versions[0]); + printf(", at 0x%08x\n", cpu_apic_address); + for (x = 1; x <= mp_naps; ++x) { + printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x)); + printf(", version: 0x%08x", cpu_apic_versions[x]); + printf(", at 0x%08x\n", cpu_apic_address); + } + +#if defined(APIC_IO) + for (x = 0; x < mp_napics; ++x) { + printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x)); + printf(", version: 0x%08x", io_apic_versions[x]); + printf(", at 0x%08x\n", io_apic_address[x]); + } +#else + printf(" Warning: APIC I/O disabled\n"); +#endif /* APIC_IO */ +} + +/* + * AP cpu's call this to sync up protected mode. + */ +void +init_secondary(void) +{ + int gsel_tss; +#ifndef VM86 + u_int my_tr; +#endif + + r_gdt.rd_limit = sizeof(gdt[0]) * (NGDT + NCPU) - 1; + r_gdt.rd_base = (int) gdt; + lgdt(&r_gdt); /* does magic intra-segment return */ + lidt(&r_idt); + lldt(_default_ldt); +#ifdef USER_LDT + currentldt = _default_ldt; +#endif + + my_tr = NGDT + cpuid; + gsel_tss = GSEL(my_tr, SEL_KPL); + gdt[my_tr].sd.sd_type = SDT_SYS386TSS; + common_tss.tss_esp0 = 0; /* not used until after switch */ + common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tss.tss_ioopt = (sizeof common_tss) << 16; +#ifdef VM86 + common_tssd = gdt[my_tr].sd; + private_tss = 0; +#endif /* VM86 */ + ltr(gsel_tss); + + load_cr0(0x8005003b); /* XXX! */ + + PTD[0] = 0; + pmap_set_opt((unsigned *)PTD); + + putmtrr(); + pmap_setvidram(); + + invltlb(); +} + + +#if defined(APIC_IO) +/* + * Final configuration of the BSP's local APIC: + * - disable 'pic mode'. + * - disable 'virtual wire mode'. + * - enable NMI. 
+ */ +void +bsp_apic_configure(void) +{ + u_char byte; + u_int32_t temp; + + /* leave 'pic mode' if necessary */ + if (picmode) { + outb(0x22, 0x70); /* select IMCR */ + byte = inb(0x23); /* current contents */ + byte |= 0x01; /* mask external INTR */ + outb(0x23, byte); /* disconnect 8259s/NMI */ + } + + /* mask lint0 (the 8259 'virtual wire' connection) */ + temp = lapic.lvt_lint0; + temp |= APIC_LVT_M; /* set the mask */ + lapic.lvt_lint0 = temp; + + /* setup lint1 to handle NMI */ + temp = lapic.lvt_lint1; + temp &= ~APIC_LVT_M; /* clear the mask */ + lapic.lvt_lint1 = temp; + + if (bootverbose) + apic_dump("bsp_apic_configure()"); +} +#endif /* APIC_IO */ + + +/******************************************************************* + * local functions and data + */ + +/* + * start the SMP system + */ +static void +mp_enable(u_int boot_addr) +{ + int x; +#if defined(APIC_IO) + int apic; + u_int ux; +#endif /* APIC_IO */ + + getmtrr(); + pmap_setvidram(); + + POSTCODE(MP_ENABLE_POST); + + /* turn on 4MB of V == P addressing so we can get to MP table */ + *(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME); + invltlb(); + + /* examine the MP table for needed info, uses physical addresses */ + x = mptable_pass2(); + + *(int *)PTD = 0; + invltlb(); + + /* can't process default configs till the CPU APIC is pmapped */ + if (x) + default_mp_table(x); + + /* post scan cleanup */ + fix_mp_table(); + setup_apic_irq_mapping(); + +#if defined(APIC_IO) + + /* fill the LOGICAL io_apic_versions table */ + for (apic = 0; apic < mp_napics; ++apic) { + ux = io_apic_read(apic, IOAPIC_VER); + io_apic_versions[apic] = ux; + } + + /* program each IO APIC in the system */ + for (apic = 0; apic < mp_napics; ++apic) + if (io_apic_setup(apic) < 0) + panic("IO APIC setup failure"); + + /* install a 'Spurious INTerrupt' vector */ + setidt(XSPURIOUSINT_OFFSET, Xspuriousint, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for TLB invalidation */ + setidt(XINVLTLB_OFFSET, Xinvltlb, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + +#ifdef BETTER_CLOCK + /* install an inter-CPU IPI for reading processor state */ + setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + + /* install an inter-CPU IPI for forcing an additional software trap */ + setidt(XCPUAST_OFFSET, Xcpuast, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for interrupt forwarding */ + setidt(XFORWARD_IRQ_OFFSET, Xforward_irq, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for CPU stop/restart */ + setidt(XCPUSTOP_OFFSET, Xcpustop, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + +#if defined(TEST_TEST1) + /* install a "fake hardware INTerrupt" vector */ + setidt(XTEST1_OFFSET, Xtest1, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif /** TEST_TEST1 */ + +#endif /* APIC_IO */ + + /* initialize all SMP locks */ + init_locks(); + + /* start each Application Processor */ + start_all_aps(boot_addr); + + /* + * The init process might be started on a different CPU now, + * and the boot CPU might not call prepare_usermode to get + * cr0 correctly configured. Thus we initialize cr0 here. 
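+ * (CR0_WP makes supervisor-mode writes respect page-level write
+ * protection and CR0_AM enables alignment checking, mirroring the cr0
+ * setup that prepare_usermode would normally have performed.)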
+ */ + load_cr0(rcr0() | CR0_WP | CR0_AM); +} + + +/* + * look for the MP spec signature + */ + +/* string defined by the Intel MP Spec as identifying the MP table */ +#define MP_SIG 0x5f504d5f /* _MP_ */ +#define NEXT(X) ((X) += 4) +static int +search_for_sig(u_int32_t target, int count) +{ + int x; + u_int32_t *addr = (u_int32_t *) (KERNBASE + target); + + for (x = 0; x < count; NEXT(x)) + if (addr[x] == MP_SIG) + /* make array index a byte index */ + return (target + (x * sizeof(u_int32_t))); + + return -1; +} + + +static basetable_entry basetable_entry_types[] = +{ + {0, 20, "Processor"}, + {1, 8, "Bus"}, + {2, 8, "I/O APIC"}, + {3, 8, "I/O INT"}, + {4, 8, "Local INT"} +}; + +typedef struct BUSDATA { + u_char bus_id; + enum busTypes bus_type; +} bus_datum; + +typedef struct INTDATA { + u_char int_type; + u_short int_flags; + u_char src_bus_id; + u_char src_bus_irq; + u_char dst_apic_id; + u_char dst_apic_int; + u_char int_vector; +} io_int, local_int; + +typedef struct BUSTYPENAME { + u_char type; + char name[7]; +} bus_type_name; + +static bus_type_name bus_type_table[] = +{ + {CBUS, "CBUS"}, + {CBUSII, "CBUSII"}, + {EISA, "EISA"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {ISA, "ISA"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {PCI, "PCI"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {XPRESS, "XPRESS"}, + {UNKNOWN_BUSTYPE, "---"} +}; +/* from MP spec v1.4, table 5-1 */ +static int default_data[7][5] = +{ +/* nbus, id0, type0, id1, type1 */ + {1, 0, ISA, 255, 255}, + {1, 0, EISA, 255, 255}, + {1, 0, EISA, 255, 255}, + {0, 255, 255, 255, 255},/* MCA not supported */ + {2, 0, ISA, 1, PCI}, + {2, 0, EISA, 1, PCI}, + {0, 255, 255, 255, 255} /* MCA not supported */ +}; + + +/* the bus data */ +static bus_datum bus_data[NBUS]; + +/* the IO INT data, one entry per possible APIC INTerrupt */ +static io_int io_apic_ints[NINTR]; + +static int nintrs; + +static int processor_entry __P((proc_entry_ptr entry, int cpu)); +static int bus_entry __P((bus_entry_ptr entry, int bus)); +static int io_apic_entry __P((io_apic_entry_ptr entry, int apic)); +static int int_entry __P((int_entry_ptr entry, int intr)); +static int lookup_bus_type __P((char *name)); + + +/* + * 1st pass on motherboard's Intel MP specification table. 
+ * + * initializes: + * mp_ncpus = 1 + * + * determines: + * cpu_apic_address (common to all CPUs) + * io_apic_address[N] + * mp_naps + * mp_nbusses + * mp_napics + * nintrs + */ +static int +mptable_pass1(void) +{ + int x; + mpcth_t cth; + int totalSize; + void* position; + int count; + int type; + int mustpanic; + + POSTCODE(MPTABLE_PASS1_POST); + + mustpanic = 0; + + /* clear various tables */ + for (x = 0; x < NAPICID; ++x) { + io_apic_address[x] = ~0; /* IO APIC address table */ + } + + /* init everything to empty */ + mp_naps = 0; + mp_nbusses = 0; + mp_napics = 0; + nintrs = 0; + + /* check for use of 'default' configuration */ + if (MPFPS_MPFB1 != 0) { + /* use default addresses */ + cpu_apic_address = DEFAULT_APIC_BASE; + io_apic_address[0] = DEFAULT_IO_APIC_BASE; + + /* fill in with defaults */ + mp_naps = 2; /* includes BSP */ + mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; +#if defined(APIC_IO) + mp_napics = 1; + nintrs = 16; +#endif /* APIC_IO */ + } + else { + if ((cth = mpfps->pap) == 0) + panic("MP Configuration Table Header MISSING!"); + + cpu_apic_address = (vm_offset_t) cth->apic_address; + + /* walk the table, recording info of interest */ + totalSize = cth->base_table_length - sizeof(struct MPCTH); + position = (u_char *) cth + sizeof(struct MPCTH); + count = cth->entry_count; + + while (count--) { + switch (type = *(u_char *) position) { + case 0: /* processor_entry */ + if (((proc_entry_ptr)position)->cpu_flags + & PROCENTRY_FLAG_EN) + ++mp_naps; + break; + case 1: /* bus_entry */ + ++mp_nbusses; + break; + case 2: /* io_apic_entry */ + if (((io_apic_entry_ptr)position)->apic_flags + & IOAPICENTRY_FLAG_EN) + io_apic_address[mp_napics++] = + (vm_offset_t)((io_apic_entry_ptr) + position)->apic_address; + break; + case 3: /* int_entry */ + ++nintrs; + break; + case 4: /* int_entry */ + break; + default: + panic("mpfps Base Table HOSED!"); + /* NOTREACHED */ + } + + totalSize -= basetable_entry_types[type].length; + (u_char*)position += basetable_entry_types[type].length; + } + } + + /* qualify the numbers */ + if (mp_naps > NCPU) +#if 0 /* XXX FIXME: kern/4255 */ + printf("Warning: only using %d of %d available CPUs!\n", + NCPU, mp_naps); +#else + { + printf("NCPU cannot be different than actual CPU count.\n"); + printf(" add 'options NCPU=%d' to your kernel config file,\n", + mp_naps); + printf(" then rerun config & rebuild your SMP kernel\n"); + mustpanic = 1; + } +#endif /* XXX FIXME: kern/4255 */ + if (mp_nbusses > NBUS) { + printf("found %d busses, increase NBUS\n", mp_nbusses); + mustpanic = 1; + } + if (mp_napics > NAPIC) { + printf("found %d apics, increase NAPIC\n", mp_napics); + mustpanic = 1; + } + if (nintrs > NINTR) { + printf("found %d intrs, increase NINTR\n", nintrs); + mustpanic = 1; + } + + /* + * Count the BSP. + * This is also used as a counter while starting the APs. + */ + mp_ncpus = 1; + + --mp_naps; /* subtract the BSP */ + + return mustpanic; +} + + +/* + * 2nd pass on motherboard's Intel MP specification table. 
+ * + * sets: + * boot_cpu_id + * ID_TO_IO(N), phy APIC ID to log CPU/IO table + * CPU_TO_ID(N), logical CPU to APIC ID table + * IO_TO_ID(N), logical IO to APIC ID table + * bus_data[N] + * io_apic_ints[N] + */ +static int +mptable_pass2(void) +{ + int x; + mpcth_t cth; + int totalSize; + void* position; + int count; + int type; + int apic, bus, cpu, intr; + + POSTCODE(MPTABLE_PASS2_POST); + + /* clear various tables */ + for (x = 0; x < NAPICID; ++x) { + ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ + CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ + IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ + } + + /* clear bus data table */ + for (x = 0; x < NBUS; ++x) + bus_data[x].bus_id = 0xff; + + /* clear IO APIC INT table */ + for (x = 0; x < NINTR; ++x) { + io_apic_ints[x].int_type = 0xff; + io_apic_ints[x].int_vector = 0xff; + } + + /* setup the cpu/apic mapping arrays */ + boot_cpu_id = -1; + + /* record whether PIC or virtual-wire mode */ + picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0; + + /* check for use of 'default' configuration */ + if (MPFPS_MPFB1 != 0) + return MPFPS_MPFB1; /* return default configuration type */ + + if ((cth = mpfps->pap) == 0) + panic("MP Configuration Table Header MISSING!"); + + /* walk the table, recording info of interest */ + totalSize = cth->base_table_length - sizeof(struct MPCTH); + position = (u_char *) cth + sizeof(struct MPCTH); + count = cth->entry_count; + apic = bus = intr = 0; + cpu = 1; /* pre-count the BSP */ + + while (count--) { + switch (type = *(u_char *) position) { + case 0: + if (processor_entry(position, cpu)) + ++cpu; + break; + case 1: + if (bus_entry(position, bus)) + ++bus; + break; + case 2: + if (io_apic_entry(position, apic)) + ++apic; + break; + case 3: + if (int_entry(position, intr)) + ++intr; + break; + case 4: + /* int_entry(position); */ + break; + default: + panic("mpfps Base Table HOSED!"); + /* NOTREACHED */ + } + + totalSize -= basetable_entry_types[type].length; + (u_char *) position += basetable_entry_types[type].length; + } + + if (boot_cpu_id == -1) + panic("NO BSP found!"); + + /* report fact that its NOT a default configuration */ + return 0; +} + + +static void +assign_apic_irq(int apic, int intpin, int irq) +{ + int x; + + if (int_to_apicintpin[irq].ioapic != -1) + panic("assign_apic_irq: inconsistent table"); + + int_to_apicintpin[irq].ioapic = apic; + int_to_apicintpin[irq].int_pin = intpin; + int_to_apicintpin[irq].apic_address = ioapic[apic]; + int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; + + for (x = 0; x < nintrs; x++) { + if ((io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3) && + io_apic_ints[x].int_vector == 0xff && + io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && + io_apic_ints[x].dst_apic_int == intpin) + io_apic_ints[x].int_vector = irq; + } +} + +/* + * parse an Intel MP specification table + */ +static void +fix_mp_table(void) +{ + int x; + int id; + int bus_0 = 0; /* Stop GCC warning */ + int bus_pci = 0; /* Stop GCC warning */ + int num_pci_bus; + + /* + * Fix mis-numbering of the PCI bus and its INT entries if the BIOS + * did it wrong. The MP spec says that when more than 1 PCI bus + * exists the BIOS must begin with bus entries for the PCI bus and use + * actual PCI bus numbering. This implies that when only 1 PCI bus + * exists the BIOS can choose to ignore this ordering, and indeed many + * MP motherboards do ignore it. This causes a problem when the PCI + * sub-system makes requests of the MP sub-system based on PCI bus + * numbers. 
So here we look for the situation and renumber the + * busses and associated INTs in an effort to "make it right". + */ + + /* find bus 0, PCI bus, count the number of PCI busses */ + for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) { + if (bus_data[x].bus_id == 0) { + bus_0 = x; + } + if (bus_data[x].bus_type == PCI) { + ++num_pci_bus; + bus_pci = x; + } + } + /* + * bus_0 == slot of bus with ID of 0 + * bus_pci == slot of last PCI bus encountered + */ + + /* check the 1 PCI bus case for sanity */ + if (num_pci_bus == 1) { + + /* if it is number 0 all is well */ + if (bus_data[bus_pci].bus_id == 0) + return; + + /* mis-numbered, swap with whichever bus uses slot 0 */ + + /* swap the bus entry types */ + bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type; + bus_data[bus_0].bus_type = PCI; + + /* swap each relavant INTerrupt entry */ + id = bus_data[bus_pci].bus_id; + for (x = 0; x < nintrs; ++x) { + if (io_apic_ints[x].src_bus_id == id) { + io_apic_ints[x].src_bus_id = 0; + } + else if (io_apic_ints[x].src_bus_id == 0) { + io_apic_ints[x].src_bus_id = id; + } + } + } + /* sanity check if more than 1 PCI bus */ + else if (num_pci_bus > 1) { + for (x = 0; x < mp_nbusses; ++x) { + if (bus_data[x].bus_type != PCI) + continue; + if (bus_data[x].bus_id >= num_pci_bus) + panic("bad PCI bus numbering"); + } + } +} + + +static void +setup_apic_irq_mapping(void) +{ + int x; + int int_vector; + + /* Assign low level interrupt handlers */ + for (x = 0; x < APIC_INTMAPSIZE; x++) { + int_to_apicintpin[x].ioapic = -1; + int_to_apicintpin[x].int_pin = 0; + int_to_apicintpin[x].apic_address = NULL; + int_to_apicintpin[x].redirindex = 0; + } + for (x = 0; x < nintrs; x++) { + if (io_apic_ints[x].dst_apic_int <= APIC_INTMAPSIZE && + io_apic_ints[x].dst_apic_id == IO_TO_ID(0) && + io_apic_ints[x].int_vector == 0xff && + (io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3)) { + assign_apic_irq(0, + io_apic_ints[x].dst_apic_int, + io_apic_ints[x].dst_apic_int); + } + } + int_vector = 0; + while (int_vector < APIC_INTMAPSIZE && + int_to_apicintpin[int_vector].ioapic != -1) + int_vector++; + for (x = 0; x < nintrs && int_vector < APIC_INTMAPSIZE; x++) { + if ((io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3) && + io_apic_ints[x].int_vector == 0xff) { + assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id), + io_apic_ints[x].dst_apic_int, + int_vector); + int_vector++; + while (int_vector < APIC_INTMAPSIZE && + int_to_apicintpin[int_vector].ioapic != -1) + int_vector++; + } + } +} + + +static int +processor_entry(proc_entry_ptr entry, int cpu) +{ + /* check for usability */ + if ((cpu >= NCPU) || !(entry->cpu_flags & PROCENTRY_FLAG_EN)) + return 0; + + /* check for BSP flag */ + if (entry->cpu_flags & PROCENTRY_FLAG_BP) { + boot_cpu_id = entry->apic_id; + CPU_TO_ID(0) = entry->apic_id; + ID_TO_CPU(entry->apic_id) = 0; + return 0; /* its already been counted */ + } + + /* add another AP to list, if less than max number of CPUs */ + else { + CPU_TO_ID(cpu) = entry->apic_id; + ID_TO_CPU(entry->apic_id) = cpu; + return 1; + } +} + + +static int +bus_entry(bus_entry_ptr entry, int bus) +{ + int x; + char c, name[8]; + + /* encode the name into an index */ + for (x = 0; x < 6; ++x) { + if ((c = entry->bus_type[x]) == ' ') + break; + name[x] = c; + } + name[x] = '\0'; + + if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE) + panic("unknown bus type: '%s'", name); + + bus_data[bus].bus_id = entry->bus_id; + bus_data[bus].bus_type = x; + + return 1; +} + + +static int 
+io_apic_entry(io_apic_entry_ptr entry, int apic) +{ + if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN)) + return 0; + + IO_TO_ID(apic) = entry->apic_id; + ID_TO_IO(entry->apic_id) = apic; + + return 1; +} + + +static int +lookup_bus_type(char *name) +{ + int x; + + for (x = 0; x < MAX_BUSTYPE; ++x) + if (strcmp(bus_type_table[x].name, name) == 0) + return bus_type_table[x].type; + + return UNKNOWN_BUSTYPE; +} + + +static int +int_entry(int_entry_ptr entry, int intr) +{ + int apic; + + io_apic_ints[intr].int_type = entry->int_type; + io_apic_ints[intr].int_flags = entry->int_flags; + io_apic_ints[intr].src_bus_id = entry->src_bus_id; + io_apic_ints[intr].src_bus_irq = entry->src_bus_irq; + if (entry->dst_apic_id == 255) { + /* This signal goes to all IO APICS. Select an IO APIC + with sufficient number of interrupt pins */ + for (apic = 0; apic < mp_napics; apic++) + if (((io_apic_read(apic, IOAPIC_VER) & + IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= + entry->dst_apic_int) + break; + if (apic < mp_napics) + io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); + else + io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; + } else + io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; + io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; + + return 1; +} + + +static int +apic_int_is_bus_type(int intr, int bus_type) +{ + int bus; + + for (bus = 0; bus < mp_nbusses; ++bus) + if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) + && ((int) bus_data[bus].bus_type == bus_type)) + return 1; + + return 0; +} + + +/* + * Given a traditional ISA INT mask, return an APIC mask. + */ +u_int +isa_apic_mask(u_int isa_mask) +{ + int isa_irq; + int apic_pin; + +#if defined(SKIP_IRQ15_REDIRECT) + if (isa_mask == (1 << 15)) { + printf("skipping ISA IRQ15 redirect\n"); + return isa_mask; + } +#endif /* SKIP_IRQ15_REDIRECT */ + + isa_irq = ffs(isa_mask); /* find its bit position */ + if (isa_irq == 0) /* doesn't exist */ + return 0; + --isa_irq; /* make it zero based */ + + apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ + if (apic_pin == -1) + return 0; + + return (1 << apic_pin); /* convert pin# to a mask */ +} + + +/* + * Determine which APIC pin an ISA/EISA INT is attached to. + */ +#define INTTYPE(I) (io_apic_ints[(I)].int_type) +#define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) +#define INTIRQ(I) (io_apic_ints[(I)].int_vector) +#define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) + +#define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) +int +isa_apic_irq(int isa_irq) +{ + int intr; + + for (intr = 0; intr < nintrs; ++intr) { /* check each record */ + if (INTTYPE(intr) == 0) { /* standard INT */ + if (SRCBUSIRQ(intr) == isa_irq) { + if (apic_int_is_bus_type(intr, ISA) || + apic_int_is_bus_type(intr, EISA)) + return INTIRQ(intr); /* found */ + } + } + } + return -1; /* NOT found */ +} + + +/* + * Determine which APIC pin a PCI INT is attached to. 
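+ * For PCI sources the src_bus_irq field packs the INT line (A#-D#) in
+ * bits 0-1 and the device number in bits 2-6; the SRCBUSLINE() and
+ * SRCBUSDEVICE() macros below extract these.  Note that pci_apic_irq()
+ * expects pciInt as 1-based (INTA# == 1).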
+ */ +#define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) +#define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) +#define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) +int +pci_apic_irq(int pciBus, int pciDevice, int pciInt) +{ + int intr; + + --pciInt; /* zero based */ + + for (intr = 0; intr < nintrs; ++intr) /* check each record */ + if ((INTTYPE(intr) == 0) /* standard INT */ + && (SRCBUSID(intr) == pciBus) + && (SRCBUSDEVICE(intr) == pciDevice) + && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ + if (apic_int_is_bus_type(intr, PCI)) + return INTIRQ(intr); /* exact match */ + + return -1; /* NOT found */ +} + +int +next_apic_irq(int irq) +{ + int intr, ointr; + int bus, bustype; + + bus = 0; + bustype = 0; + for (intr = 0; intr < nintrs; intr++) { + if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) + continue; + bus = SRCBUSID(intr); + bustype = apic_bus_type(bus); + if (bustype != ISA && + bustype != EISA && + bustype != PCI) + continue; + break; + } + if (intr >= nintrs) { + return -1; + } + for (ointr = intr + 1; ointr < nintrs; ointr++) { + if (INTTYPE(ointr) != 0) + continue; + if (bus != SRCBUSID(ointr)) + continue; + if (bustype == PCI) { + if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) + continue; + if (SRCBUSLINE(intr) != SRCBUSLINE(ointr)) + continue; + } + if (bustype == ISA || bustype == EISA) { + if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) + continue; + } + if (INTPIN(intr) == INTPIN(ointr)) + continue; + break; + } + if (ointr >= nintrs) { + return -1; + } + return INTIRQ(ointr); +} +#undef SRCBUSLINE +#undef SRCBUSDEVICE +#undef SRCBUSID +#undef SRCBUSIRQ + +#undef INTPIN +#undef INTIRQ +#undef INTAPIC +#undef INTTYPE + + +/* + * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. + * + * XXX FIXME: + * Exactly what this means is unclear at this point. It is a solution + * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard + * could route any of the ISA INTs to upper (>15) IRQ values. But most would + * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an + * option. 
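+ * Until that is sorted out, the undirect_isa_irq()/undirect_pci_irq()
+ * routines below are stubs that only log the request unless READY is
+ * defined.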
+ */ +int +undirect_isa_irq(int rirq) +{ +#if defined(READY) + if (bootverbose) + printf("Freeing redirected ISA irq %d.\n", rirq); + /** FIXME: tickle the MB redirector chip */ + return ???; +#else + if (bootverbose) + printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); + return 0; +#endif /* READY */ +} + + +/* + * Reprogram the MB chipset to NOT redirect a PCI INTerrupt + */ +int +undirect_pci_irq(int rirq) +{ +#if defined(READY) + if (bootverbose) + printf("Freeing redirected PCI irq %d.\n", rirq); + + /** FIXME: tickle the MB redirector chip */ + return ???; +#else + if (bootverbose) + printf("Freeing (NOT implemented) redirected PCI irq %d.\n", + rirq); + return 0; +#endif /* READY */ +} + + +/* + * given a bus ID, return: + * the bus type if found + * -1 if NOT found + */ +int +apic_bus_type(int id) +{ + int x; + + for (x = 0; x < mp_nbusses; ++x) + if (bus_data[x].bus_id == id) + return bus_data[x].bus_type; + + return -1; +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated src bus ID if found + * -1 if NOT found + */ +int +apic_src_bus_id(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].src_bus_id); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated src bus IRQ if found + * -1 if NOT found + */ +int +apic_src_bus_irq(int apic, int pin) +{ + int x; + + for (x = 0; x < nintrs; x++) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].src_bus_irq); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated INTerrupt type if found + * -1 if NOT found + */ +int +apic_int_type(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].int_type); + + return -1; /* NOT found */ +} + +int +apic_irq(int apic, int pin) +{ + int x; + int res; + + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) { + res = io_apic_ints[x].int_vector; + if (res == 0xff) + return -1; + if (apic != int_to_apicintpin[res].ioapic) + panic("apic_irq: inconsistent table"); + if (pin != int_to_apicintpin[res].int_pin) + panic("apic_irq inconsistent table (2)"); + return res; + } + return -1; +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated trigger mode if found + * -1 if NOT found + */ +int +apic_trigger(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return ((io_apic_ints[x].int_flags >> 2) & 0x03); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated 'active' level if found + * -1 if NOT found + */ +int +apic_polarity(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].int_flags & 0x03); + + return -1; /* NOT found */ +} + + +/* + * set 
data according to MP defaults + * FIXME: probably not complete yet... + */ +static void +default_mp_table(int type) +{ + int ap_cpu_id; +#if defined(APIC_IO) + u_int32_t ux; + int io_apic_id; + int pin; +#endif /* APIC_IO */ + +#if 0 + printf(" MP default config type: %d\n", type); + switch (type) { + case 1: + printf(" bus: ISA, APIC: 82489DX\n"); + break; + case 2: + printf(" bus: EISA, APIC: 82489DX\n"); + break; + case 3: + printf(" bus: EISA, APIC: 82489DX\n"); + break; + case 4: + printf(" bus: MCA, APIC: 82489DX\n"); + break; + case 5: + printf(" bus: ISA+PCI, APIC: Integrated\n"); + break; + case 6: + printf(" bus: EISA+PCI, APIC: Integrated\n"); + break; + case 7: + printf(" bus: MCA+PCI, APIC: Integrated\n"); + break; + default: + printf(" future type\n"); + break; + /* NOTREACHED */ + } +#endif /* 0 */ + + boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24; + ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0; + + /* BSP */ + CPU_TO_ID(0) = boot_cpu_id; + ID_TO_CPU(boot_cpu_id) = 0; + + /* one and only AP */ + CPU_TO_ID(1) = ap_cpu_id; + ID_TO_CPU(ap_cpu_id) = 1; + +#if defined(APIC_IO) + /* one and only IO APIC */ + io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24; + + /* + * sanity check, refer to MP spec section 3.6.6, last paragraph + * necessary as some hardware isn't properly setting up the IO APIC + */ +#if defined(REALLY_ANAL_IOAPICID_VALUE) + if (io_apic_id != 2) { +#else + if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) { +#endif /* REALLY_ANAL_IOAPICID_VALUE */ + ux = io_apic_read(0, IOAPIC_ID); /* get current contents */ + ux &= ~APIC_ID_MASK; /* clear the ID field */ + ux |= 0x02000000; /* set it to '2' */ + io_apic_write(0, IOAPIC_ID, ux); /* write new value */ + ux = io_apic_read(0, IOAPIC_ID); /* re-read && test */ + if ((ux & APIC_ID_MASK) != 0x02000000) + panic("can't control IO APIC ID, reg: 0x%08x", ux); + io_apic_id = 2; + } + IO_TO_ID(0) = io_apic_id; + ID_TO_IO(io_apic_id) = 0; +#endif /* APIC_IO */ + + /* fill out bus entries */ + switch (type) { + case 1: + case 2: + case 3: + case 5: + case 6: + bus_data[0].bus_id = default_data[type - 1][1]; + bus_data[0].bus_type = default_data[type - 1][2]; + bus_data[1].bus_id = default_data[type - 1][3]; + bus_data[1].bus_type = default_data[type - 1][4]; + break; + + /* case 4: case 7: MCA NOT supported */ + default: /* illegal/reserved */ + panic("BAD default MP config: %d", type); + /* NOTREACHED */ + } + +#if defined(APIC_IO) + /* general cases from MP v1.4, table 5-2 */ + for (pin = 0; pin < 16; ++pin) { + io_apic_ints[pin].int_type = 0; + io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */ + io_apic_ints[pin].src_bus_id = 0; + io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */ + io_apic_ints[pin].dst_apic_id = io_apic_id; + io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */ + } + + /* special cases from MP v1.4, table 5-2 */ + if (type == 2) { + io_apic_ints[2].int_type = 0xff; /* N/C */ + io_apic_ints[13].int_type = 0xff; /* N/C */ +#if !defined(APIC_MIXED_MODE) + /** FIXME: ??? 
*/ + panic("sorry, can't support type 2 default yet"); +#endif /* APIC_MIXED_MODE */ + } + else + io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */ + + if (type == 7) + io_apic_ints[0].int_type = 0xff; /* N/C */ + else + io_apic_ints[0].int_type = 3; /* vectored 8259 */ +#endif /* APIC_IO */ +} + + +/* + * initialize all the SMP locks + */ + +/* critical region around IO APIC, apic_imen */ +struct simplelock imen_lock; + +/* critical region around splxx(), cpl, cml, cil, ipending */ +struct simplelock cpl_lock; + +/* Make FAST_INTR() routines sequential */ +struct simplelock fast_intr_lock; + +/* critical region around INTR() routines */ +struct simplelock intr_lock; + +/* lock regions protected in UP kernel via cli/sti */ +struct simplelock mpintr_lock; + +/* lock region used by kernel profiling */ +struct simplelock mcount_lock; + +#ifdef USE_COMLOCK +/* locks com (tty) data/hardware accesses: a FASTINTR() */ +struct simplelock com_lock; +#endif /* USE_COMLOCK */ + +#ifdef USE_CLOCKLOCK +/* lock regions around the clock hardware */ +struct simplelock clock_lock; +#endif /* USE_CLOCKLOCK */ + +static void +init_locks(void) +{ + /* + * Get the initial mp_lock with a count of 1 for the BSP. + * This uses a LOGICAL cpu ID, ie BSP == 0. + */ + mp_lock = 0x00000001; + + /* ISR uses its own "giant lock" */ + isr_lock = FREE_LOCK; + +#if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) + s_lock_init((struct simplelock*)&apic_itrace_debuglock); +#endif + + s_lock_init((struct simplelock*)&mpintr_lock); + + s_lock_init((struct simplelock*)&mcount_lock); + + s_lock_init((struct simplelock*)&fast_intr_lock); + s_lock_init((struct simplelock*)&intr_lock); + s_lock_init((struct simplelock*)&imen_lock); + s_lock_init((struct simplelock*)&cpl_lock); + +#ifdef USE_COMLOCK + s_lock_init((struct simplelock*)&com_lock); +#endif /* USE_COMLOCK */ +#ifdef USE_CLOCKLOCK + s_lock_init((struct simplelock*)&clock_lock); +#endif /* USE_CLOCKLOCK */ +} + + +/* + * start each AP in our list + */ +static int +start_all_aps(u_int boot_addr) +{ + int x, i; + u_char mpbiosreason; + u_long mpbioswarmvec; + pd_entry_t *newptd; + pt_entry_t *newpt; + struct globaldata *gd; + char *stack; + pd_entry_t *myPTD; + + POSTCODE(START_ALL_APS_POST); + + /* initialize BSP's local APIC */ + apic_initialize(); + bsp_apic_ready = 1; + + /* install the AP 1st level boot code */ + install_ap_tramp(boot_addr); + + + /* save the current value of the warm-start vector */ + mpbioswarmvec = *((u_long *) WARMBOOT_OFF); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + mpbiosreason = inb(CMOS_DATA); +#endif + + /* record BSP in CPU map */ + all_cpus = 1; + + /* start each AP */ + for (x = 1; x <= mp_naps; ++x) { + + /* This is a bit verbose, it will go away soon. 
*/ + + /* alloc new page table directory */ + newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + /* Store the virtual PTD address for this CPU */ + IdlePTDS[x] = newptd; + + /* clone currently active one (ie: IdlePTD) */ + bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */ + + /* set up 0 -> 4MB P==V mapping for AP boot */ + newptd[0] = (void *)(uintptr_t)(PG_V | PG_RW | + ((uintptr_t)(void *)KPTphys & PG_FRAME)); + + /* store PTD for this AP's boot sequence */ + myPTD = (pd_entry_t *)vtophys(newptd); + + /* alloc new page table page */ + newpt = (pt_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + /* set the new PTD's private page to point there */ + newptd[MPPTDI] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt)); + + /* install self referential entry */ + newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd)); + + /* allocate a new private data page */ + gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE); + + /* wire it into the private page table page */ + newpt[0] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd)); + + /* wire the ptp into itself for access */ + newpt[1] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt)); + + /* copy in the pointer to the local apic */ + newpt[2] = SMP_prvpt[2]; + + /* and the IO apic mapping[s] */ + for (i = 16; i < 32; i++) + newpt[i] = SMP_prvpt[i]; + + /* allocate and set up an idle stack data page */ + stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE); + for (i = 0; i < UPAGES; i++) + newpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + newpt[3 + UPAGES] = 0; /* *prv_CMAP1 */ + newpt[4 + UPAGES] = 0; /* *prv_CMAP2 */ + newpt[5 + UPAGES] = 0; /* *prv_CMAP3 */ + newpt[6 + UPAGES] = 0; /* *prv_PMAP1 */ + + /* prime data page for it to use */ + gd->cpuid = x; + gd->cpu_lockid = x << 24; + gd->my_idlePTD = myPTD; + gd->prv_CMAP1 = &newpt[3 + UPAGES]; + gd->prv_CMAP2 = &newpt[4 + UPAGES]; + gd->prv_CMAP3 = &newpt[5 + UPAGES]; + gd->prv_PMAP1 = &newpt[6 + UPAGES]; + + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ +#endif + + bootPTD = myPTD; + /* attempt to start the Application Processor */ + CHECK_INIT(99); /* setup checkpoints */ + if (!start_ap(x, boot_addr)) { + printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x)); + CHECK_PRINT("trace"); /* show checkpoints */ + /* better panic as the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + CHECK_PRINT("trace"); /* show checkpoints */ + + /* record its version info */ + cpu_apic_versions[x] = cpu_apic_versions[0]; + + all_cpus |= (1 << x); /* record AP in CPU map */ + } + + /* build our map of 'other' CPUs */ + other_cpus = all_cpus & ~(1 << cpuid); + + /* fill in our (BSP) APIC version */ + cpu_apic_versions[0] = lapic.version; + + /* restore the warmstart vector */ + *(u_long *) WARMBOOT_OFF = mpbioswarmvec; +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, mpbiosreason); +#endif + + /* + * Set up the idle context for the BSP. Similar to above except + * that some was done by locore, some by pmap.c and some is implicit + * because the BSP is cpu#0 and the page is initially zero, and also + * because we can refer to variables by name on the BSP.. 
+ */ + newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */ + IdlePTDS[0] = newptd; + + /* Point PTD[] to this page instead of IdlePTD's physical page */ + newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd)); + + my_idlePTD = (pd_entry_t *)vtophys(newptd); + + /* Allocate and setup BSP idle stack */ + stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); + for (i = 0; i < UPAGES; i++) + SMP_prvpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + pmap_set_opt_bsp(); + + for (i = 0; i < mp_ncpus; i++) { + bcopy( (int *) PTD + KPTDI, (int *) IdlePTDS[i] + KPTDI, NKPDE * sizeof (int)); + } + + /* number of APs actually started */ + return mp_ncpus - 1; +} + + +/* + * load the 1st level AP boot code into base memory. + */ + +/* targets for relocation */ +extern void bigJump(void); +extern void bootCodeSeg(void); +extern void bootDataSeg(void); +extern void MPentry(void); +extern u_int MP_GDT; +extern u_int mp_gdtbase; + +static void +install_ap_tramp(u_int boot_addr) +{ + int x; + int size = *(int *) ((u_long) & bootMP_size); + u_char *src = (u_char *) ((u_long) bootMP); + u_char *dst = (u_char *) boot_addr + KERNBASE; + u_int boot_base = (u_int) bootMP; + u_int8_t *dst8; + u_int16_t *dst16; + u_int32_t *dst32; + + POSTCODE(INSTALL_AP_TRAMP_POST); + + for (x = 0; x < size; ++x) + *dst++ = *src++; + + /* + * modify addresses in code we just moved to basemem. unfortunately we + * need fairly detailed info about mpboot.s for this to work. changes + * to mpboot.s might require changes here. + */ + + /* boot code is located in KERNEL space */ + dst = (u_char *) boot_addr + KERNBASE; + + /* modify the lgdt arg */ + dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); + *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); + + /* modify the ljmp target for MPentry() */ + dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); + *dst32 = ((u_int) MPentry - KERNBASE); + + /* modify the target for boot code segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_addr & 0xffff; + *dst8 = ((u_int) boot_addr >> 16) & 0xff; + + /* modify the target for boot data segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_addr & 0xffff; + *dst8 = ((u_int) boot_addr >> 16) & 0xff; +} + + +/* + * this function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It ain't pretty, + * but it seems to work. + */ +static int +start_ap(int logical_cpu, u_int boot_addr) +{ + int physical_cpu; + int vector; + int cpus; + u_long icr_lo, icr_hi; + + POSTCODE(START_AP_POST); + + /* get the PHYSICAL APIC ID# */ + physical_cpu = CPU_TO_ID(logical_cpu); + + /* calculate the vector */ + vector = (boot_addr >> 12) & 0xff; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_ncpus; + + /* + * first we do an INIT/RESET IPI this INIT IPI might be run, reseting + * and running the target CPU. OR this INIT IPI might be latched (P5 + * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be + * ignored. 
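+	 *
+	 * (The ICR writes below encode this sequence: 0x0000c500 asserts a
+	 * level-triggered INIT, 0x00008500 de-asserts it, and
+	 * 0x00000600 | vector is a STARTUP IPI whose vector is the 4K page
+	 * number of the trampoline at boot_addr.)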
+ */ + + /* setup the address for the target AP */ + icr_hi = lapic.icr_hi & ~APIC_ID_MASK; + icr_hi |= (physical_cpu << 24); + lapic.icr_hi = icr_hi; + + /* do an INIT IPI: assert RESET */ + icr_lo = lapic.icr_lo & 0xfff00000; + lapic.icr_lo = icr_lo | 0x0000c500; + + /* wait for pending status end */ + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + + /* do an INIT IPI: deassert RESET */ + lapic.icr_lo = icr_lo | 0x00008500; + + /* wait for pending status end */ + u_sleep(10000); /* wait ~10mS */ + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + + /* + * next we do a STARTUP IPI: the previous INIT IPI might still be + * latched, (P5 bug) this 1st STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. OR + * the previous INIT IPI has already run. and this STARTUP IPI will + * run. OR the previous INIT IPI was ignored. and this STARTUP IPI + * will run. + */ + + /* do a STARTUP IPI */ + lapic.icr_lo = icr_lo | 0x00000600 | vector; + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + u_sleep(200); /* wait ~200uS */ + + /* + * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF + * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR + * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + + lapic.icr_lo = icr_lo | 0x00000600 | vector; + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + u_sleep(200); /* wait ~200uS */ + + /* wait for it to start */ + set_apic_timer(5000000);/* == 5 seconds */ + while (read_apic_timer()) + if (mp_ncpus > cpus) + return 1; /* return SUCCESS */ + + return 0; /* return FAILURE */ +} + + +/* + * Flush the TLB on all other CPU's + * + * XXX: Needs to handshake and wait for completion before proceding. + */ +void +smp_invltlb(void) +{ +#if defined(APIC_IO) + if (smp_started && invltlb_ok) + all_but_self_ipi(XINVLTLB_OFFSET); +#endif /* APIC_IO */ +} + +void +invlpg(u_int addr) +{ + __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); + + /* send a message to the other CPUs */ + smp_invltlb(); +} + +void +invltlb(void) +{ + u_long temp; + + /* + * This should be implemented as load_cr3(rcr3()) when load_cr3() is + * inlined. + */ + __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); + + /* send a message to the other CPUs */ + smp_invltlb(); +} + + +/* + * When called the executing CPU will send an IPI to all other CPUs + * requesting that they halt execution. + * + * Usually (but not necessarily) called with 'other_cpus' as its arg. + * + * - Signals all CPUs in map to stop. + * - Waits for each to stop. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + * + * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs + * from executing at same time. + */ +int +stop_cpus(u_int map) +{ + if (!smp_started) + return 0; + + /* send the Xcpustop IPI to all CPUs in map */ + selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED); + + while ((stopped_cpus & map) != map) + /* spin */ ; + + return 1; +} + + +/* + * Called by a CPU to restart stopped CPUs. + * + * Usually (but not necessarily) called with 'stopped_cpus' as its arg. + * + * - Signals all CPUs in map to restart. + * - Waits for each to restart. 
+ * + * Returns: + * -1: error + * 0: NA + * 1: ok + */ +int +restart_cpus(u_int map) +{ + if (!smp_started) + return 0; + + started_cpus = map; /* signal other cpus to restart */ + + while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */ + /* spin */ ; + + return 1; +} + +int smp_active = 0; /* are the APs allowed to run? */ +SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, ""); + +/* XXX maybe should be hw.ncpu */ +static int smp_cpus = 1; /* how many cpu's running */ +SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, ""); + +int invltlb_ok = 0; /* throttle smp_invltlb() till safe */ +SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, ""); + +/* Warning: Do not staticize. Used from swtch.s */ +int do_page_zero_idle = 1; /* bzero pages for fun and profit in idleloop */ +SYSCTL_INT(_machdep, OID_AUTO, do_page_zero_idle, CTLFLAG_RW, + &do_page_zero_idle, 0, ""); + +/* Is forwarding of a interrupt to the CPU holding the ISR lock enabled ? */ +int forward_irq_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_irq_enabled, CTLFLAG_RW, + &forward_irq_enabled, 0, ""); + +/* Enable forwarding of a signal to a process running on a different CPU */ +static int forward_signal_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, + &forward_signal_enabled, 0, ""); + +/* Enable forwarding of roundrobin to all other cpus */ +static int forward_roundrobin_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, + &forward_roundrobin_enabled, 0, ""); + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +void ap_init(void); + +void +ap_init() +{ + u_int apic_id; + + smp_cpus++; + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + lidt(&r_idt); +#endif + + /* Build our map of 'other' CPUs. */ + other_cpus = all_cpus & ~(1 << cpuid); + + printf("SMP: AP CPU #%d Launched!\n", cpuid); + + /* XXX FIXME: i386 specific, and redundant: Setup the FPU. */ + load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS); + + /* A quick check from sanity claus */ + apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); + if (cpuid != apic_id) { + printf("SMP: cpuid = %d\n", cpuid); + printf("SMP: apic_id = %d\n", apic_id); + printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); + panic("cpuid mismatch! boom!!"); + } + + getmtrr(); + + /* Init local apic for irq's */ + apic_initialize(); + + /* + * Activate smp_invltlb, although strictly speaking, this isn't + * quite correct yet. We should have a bitfield for cpus willing + * to accept TLB flush IPI's or something and sync them. + */ + if (smp_cpus == mp_ncpus) { + invltlb_ok = 1; + smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ + smp_active = 1; /* historic */ + } + + curproc = NULL; /* make sure */ +} + +#ifdef BETTER_CLOCK + +#define CHECKSTATE_USER 0 +#define CHECKSTATE_SYS 1 +#define CHECKSTATE_INTR 2 + +/* Do not staticize. 
Used from apic_vector.s */ +struct proc* checkstate_curproc[NCPU]; +int checkstate_cpustate[NCPU]; +u_long checkstate_pc[NCPU]; + +extern long cp_time[CPUSTATES]; + +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +static void +addupc_intr_forwarded(struct proc *p, int id, int *astmap) +{ + int i; + struct uprof *prof; + u_long pc; + + pc = checkstate_pc[id]; + prof = &p->p_stats->p_prof; + if (pc >= prof->pr_off && + (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) { + if ((p->p_flag & P_OWEUPC) == 0) { + prof->pr_addr = pc; + prof->pr_ticks = 1; + p->p_flag |= P_OWEUPC; + } + *astmap |= (1 << id); + } +} + +static void +forwarded_statclock(int id, int pscnt, int *astmap) +{ + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + int cpustate; + struct proc *p; +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + + p = checkstate_curproc[id]; + cpustate = checkstate_cpustate[id]; + + switch (cpustate) { + case CHECKSTATE_USER: + if (p->p_flag & P_PROFIL) + addupc_intr_forwarded(p, id, astmap); + if (pscnt > 1) + return; + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + break; + case CHECKSTATE_SYS: +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + + if (!p) + cp_time[CP_IDLE]++; + else { + p->p_sticks++; + cp_time[CP_SYS]++; + } + break; + case CHECKSTATE_INTR: + default: +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + if (p) + p->p_iticks++; + cp_time[CP_INTR]++; + } + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +void +forward_statclock(int pscnt) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. 
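+	 *
+	 * The simple interrupt is the XCPUCHECKSTATE IPI sent in step 1
+	 * below; the other cpus report their state through
+	 * checkstate_curproc[], checkstate_cpustate[] and checkstate_pc[]
+	 * and acknowledge via checkstate_probed_cpus, which we poll before
+	 * doing the accounting locally in step 2.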
+ */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle ) */ + + map = other_cpus & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + selected_apic_ipi(map, + XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + break; + } + } + + /* + * Step 2: walk through other processors processes, update ticks and + * profiling info. + */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + forwarded_statclock(id, pscnt, &map); + } + if (map != 0) { + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +void +forward_hardclock(int pscnt) +{ + int map; + int id; + struct proc *p; + struct pstats *pstats; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle) */ + + map = other_cpus & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + selected_apic_ipi(map, + XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + break; + } + } + + /* + * Step 2: walk through other processors processes, update virtual + * timer and profiling timer. If stathz == 0, also update ticks and + * profiling info. 
+ */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + p = checkstate_curproc[id]; + if (p) { + pstats = p->p_stats; + if (checkstate_cpustate[id] == CHECKSTATE_USER && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + psignal(p, SIGVTALRM); + map |= (1 << id); + } + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + psignal(p, SIGPROF); + map |= (1 << id); + } + } + if (stathz == 0) { + forwarded_statclock( id, pscnt, &map); + } + } + if (map != 0) { + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +#endif /* BETTER_CLOCK */ + +void +forward_signal(struct proc *p) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + if (!forward_signal_enabled) + return; + while (1) { + if (p->p_stat != SRUN) + return; + id = (u_char) p->p_oncpu; + if (id == 0xff) + return; + map = (1<<id); + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#if 0 + printf("forward_signal: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + if (id == (u_char) p->p_oncpu) + return; + } +} + +void +forward_roundrobin(void) +{ + u_int map; + int i; + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + if (!forward_roundrobin_enabled) + return; + resched_cpus |= other_cpus; + map = other_cpus & ~stopped_cpus ; +#if 1 + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); +#else + (void) all_but_self_ipi(XCPUAST_OFFSET); +#endif + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#if 0 + printf("forward_roundrobin: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } +} + + +#ifdef APIC_INTR_REORDER +/* + * Maintain mapping from softintr vector to isr bit in local apic. + */ +void +set_lapic_isrloc(int intr, int vector) +{ + if (intr < 0 || intr > 32) + panic("set_apic_isrloc: bad intr argument: %d",intr); + if (vector < ICU_OFFSET || vector > 255) + panic("set_apic_isrloc: bad vector argument: %d",vector); + apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); + apic_isrbit_location[intr].bit = (1<<(vector & 31)); +} +#endif diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 57195f3..42b0c85 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -1,6 +1,7 @@ /*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. 
+ * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. @@ -33,515 +34,1145 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)trap.c 7.4 (Berkeley) 5/13/91 - * - * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE - * -------------------- ----- ---------------------- - * CURRENT PATCH LEVEL: 1 00137 - * -------------------- ----- ---------------------- - * - * 08 Apr 93 Bruce Evans Several VM system fixes - * Paul Kranenburg Add counter for vmstat + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $Id: trap.c,v 1.132 1998/12/28 23:02:56 msmith Exp $ */ -static char rcsid[] = "$Header: /usr/bill/working/sys/i386/i386/RCS/trap.c,v 1.2 92/01/21 14:22:13 william Exp $"; /* - * 386 Trap and System call handleing + * 386 Trap and System call handling */ -#include "machine/cpu.h" -#include "machine/psl.h" -#include "machine/reg.h" +#include "opt_cpu.h" +#include "opt_ddb.h" +#include "opt_ktrace.h" +#include "opt_trap.h" +#include "opt_vm86.h" -#include "param.h" -#include "systm.h" -#include "proc.h" -#include "user.h" -#include "acct.h" -#include "kernel.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/kernel.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> #ifdef KTRACE -#include "ktrace.h" +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <machine/cpu.h> +#include <machine/ipl.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#ifdef SMP +#include <machine/smp.h> #endif +#include <machine/tss.h> + +#include <i386/isa/intr_machdep.h> + +#ifdef POWERFAIL_NMI +#include <sys/syslog.h> +#include <machine/clock.h> +#endif + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#ifdef DDB + extern int in_Debugger, debugger_on_panic; +#endif + +#include "isa.h" +#include "npx.h" + +extern struct i386tss common_tss; + +int (*pmath_emulate) __P((struct trapframe *)); + +extern void trap __P((struct trapframe frame)); +extern int trapwrite __P((unsigned addr)); +extern void syscall __P((struct trapframe frame)); + +static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); +static void trap_fatal __P((struct trapframe *, vm_offset_t)); +void dblfault_handler __P((void)); + +extern inthand_t IDTVEC(syscall); + +#define MAX_TRAP_MSG 28 +static char *trap_msg[] = { + "", /* 0 unused */ + "privileged instruction fault", /* 1 T_PRIVINFLT */ + "", /* 2 unused */ + "breakpoint instruction fault", /* 3 T_BPTFLT */ + "", /* 4 unused */ + "", /* 5 unused */ + "arithmetic trap", /* 6 T_ARITHTRAP */ + "system forced exception", /* 7 T_ASTFLT */ + "", /* 8 unused */ + "general protection fault", /* 9 T_PROTFLT */ + "trace trap", /* 10 T_TRCTRAP */ + "", /* 11 unused */ + "page fault", /* 12 T_PAGEFLT */ + "", /* 13 unused */ + "alignment fault", /* 14 T_ALIGNFLT */ + "", /* 15 unused */ + "", /* 16 unused */ + "", /* 17 unused */ + "integer divide fault", /* 18 T_DIVIDE */ + "non-maskable interrupt trap", /* 19 T_NMI */ + "overflow trap", /* 20 T_OFLOW */ + "FPU bounds check fault", /* 21 
T_BOUND */ + "FPU device not available", /* 22 T_DNA */ + "double fault", /* 23 T_DOUBLEFLT */ + "FPU operand fetch fault", /* 24 T_FPOPFLT */ + "invalid TSS fault", /* 25 T_TSSFLT */ + "segment not present fault", /* 26 T_SEGNPFLT */ + "stack fault", /* 27 T_STKFLT */ + "machine check trap", /* 28 T_MCHK */ +}; + +static __inline void userret __P((struct proc *p, struct trapframe *frame, + u_quad_t oticks)); -#include "vm/vm_param.h" -#include "vm/pmap.h" -#include "vm/vm_map.h" -#include "sys/vmmeter.h" +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +extern struct gate_descriptor *t_idt; +extern int has_f00f_bug; +#endif -#include "machine/trap.h" +static __inline void +userret(p, frame, oticks) + struct proc *p; + struct trapframe *frame; + u_quad_t oticks; +{ + int sig, s; + while ((sig = CURSIG(p)) != 0) + postsig(sig); -struct sysent sysent[]; -int nsysent; -int dostacklimits; -unsigned rcr2(); -extern short cpl; +#if 0 + if (!want_resched && + (p->p_priority <= p->p_usrpri) && + (p->p_rtprio.type == RTP_PRIO_NORMAL)) { + int newpriority; + p->p_estcpu += 1; + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + } +#endif + + p->p_priority = p->p_usrpri; + if (want_resched) { + /* + * Since we are curproc, clock will normally just change + * our priority without moving us from one queue to another + * (since the running process is not on a queue.) + * If that happened after we setrunqueue ourselves but before we + * mi_switch()'ed, we might not be on the queue indicated by + * our priority. + */ + s = splhigh(); + setrunqueue(p); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + splx(s); + while ((sig = CURSIG(p)) != 0) + postsig(sig); + } + /* + * Charge system time if profiling. + */ + if (p->p_flag & P_PROFIL) + addupc_task(p, frame->tf_eip, + (u_int)(p->p_sticks - oticks) * psratio); + curpriority = p->p_priority; +} /* - * trap(frame): - * Exception, fault, and trap interface to BSD kernel. This - * common code is called from assembly language IDT gate entry + * Exception, fault, and trap interface to the FreeBSD kernel. + * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this - * frame after the exception has been processed. Note that the - * effect is as if the arguments were passed call by reference. + * frame after the exception has been processed. */ -/*ARGSUSED*/ +void trap(frame) struct trapframe frame; { - register int i; - register struct proc *p = curproc; - struct timeval syst; - int ucode, type, code, eva; + struct proc *p = curproc; + u_quad_t sticks = 0; + int i = 0, ucode = 0, type, code; + vm_offset_t eva; - frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ - type = frame.tf_trapno; -#include "ddb.h" -#if NDDB > 0 - if (curpcb && curpcb->pcb_onfault) { - if (frame.tf_trapno == T_BPTFLT - || frame.tf_trapno == T_TRCTRAP) - if (kdb_trap (type, 0, &frame)) - return; - } -#endif - -/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x", - frame.tf_trapno, frame.tf_err, frame.tf_eip, - frame.tf_cs, rcr2(), frame.tf_esp);*/ -if(curpcb == 0 || curproc == 0) goto we_re_toast; - if (curpcb->pcb_onfault && frame.tf_trapno != 0xc) { -copyfault: - frame.tf_eip = (int)curpcb->pcb_onfault; - return; + if (!(frame.tf_eflags & PSL_I)) { + /* + * Buggy application or kernel code has disabled interrupts + * and then trapped. 
Enabling interrupts now is wrong, but + * it is better than running with interrupts disabled until + * they are accidentally enabled later. + */ + type = frame.tf_trapno; + if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) + printf( + "pid %ld (%s): trap %d with interrupts disabled\n", + (long)curproc->p_pid, curproc->p_comm, type); + else if (type != T_BPTFLT && type != T_TRCTRAP) + /* + * XXX not quite right, since this may be for a + * multiple fault in user mode. + */ + printf("kernel trap %d with interrupts disabled\n", + type); + enable_intr(); } - syst = p->p_stime; - if (ISPL(frame.tf_cs) == SEL_UPL) { - type |= T_USER; - p->p_regs = (int *)&frame; - curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */ + eva = 0; + if (frame.tf_trapno == T_PAGEFLT) { + /* + * For some Cyrix CPUs, %cr2 is clobbered by interrupts. + * This problem is worked around by using an interrupt + * gate for the pagefault handler. We are finally ready + * to read %cr2 and then must reenable interrupts. + * + * XXX this should be in the switch statement, but the + * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the + * flow of control too much for this to be obviously + * correct. + */ + eva = rcr2(); + enable_intr(); } - ucode=0; - eva = rcr2(); +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +restart: +#endif + type = frame.tf_trapno; code = frame.tf_err; - switch (type) { - default: - we_re_toast: -#ifdef KDB - if (kdb_trap(&psl)) +#ifdef VM86 + if (in_vm86call) { + if (frame.tf_eflags & PSL_VM && + (type == T_PROTFLT || type == T_STKFLT)) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); return; -#endif -#if NDDB > 0 - if (kdb_trap (type, 0, &frame)) + } + switch (type) { + /* + * these traps want either a process context, or + * assume a normal userspace trap. 
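+		 * (In practice: a protection or segment-not-present fault
+		 * taken during a vm86 call is fatal, and a trace trap is
+		 * handed to the kernel breakpoint handling below.)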
+ */ + case T_PROTFLT: + case T_SEGNPFLT: + trap_fatal(&frame, eva); return; + case T_TRCTRAP: + type = T_BPTFLT; /* kernel breakpoint */ + /* FALL THROUGH */ + } + goto kernel_trap; /* normal kernel trap handling */ + } #endif - printf("trap type %d code = %x eip = %x cs = %x eflags = %x ", - frame.tf_trapno, frame.tf_err, frame.tf_eip, - frame.tf_cs, frame.tf_eflags); - eva = rcr2(); - printf("cr2 %x cpl %x\n", eva, cpl); - /* type &= ~T_USER; */ /* XXX what the hell is this */ - panic("trap"); - /*NOTREACHED*/ - - case T_SEGNPFLT|T_USER: - case T_STKFLT|T_USER: - case T_PROTFLT|T_USER: /* protection fault */ - ucode = code + BUS_SEGM_FAULT ; - i = SIGBUS; - break; + if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + /* user trap */ - case T_PRIVINFLT|T_USER: /* privileged instruction fault */ - case T_RESADFLT|T_USER: /* reserved addressing fault */ - case T_RESOPFLT|T_USER: /* reserved operand fault */ - case T_FPOPFLT|T_USER: /* coprocessor operand fault */ - ucode = type &~ T_USER; - i = SIGILL; - break; + sticks = p->p_sticks; + p->p_md.md_regs = &frame; + + switch (type) { + case T_PRIVINFLT: /* privileged instruction fault */ + ucode = type; + i = SIGILL; + break; + + case T_BPTFLT: /* bpt instruction fault */ + case T_TRCTRAP: /* trace trap */ + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + break; + + case T_ARITHTRAP: /* arithmetic trap */ + ucode = code; + i = SIGFPE; + break; + + case T_ASTFLT: /* Allow process switch */ + astoff(); + cnt.v_soft++; + if (p->p_flag & P_OWEUPC) { + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); + } + goto out; + + /* + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. + */ + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ +#ifdef VM86 + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i == 0) + goto out; + break; + } +#endif /* VM86 */ + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + case T_TSSFLT: /* invalid TSS fault */ + case T_DOUBLEFLT: /* double fault */ + default: + ucode = code + BUS_SEGM_FAULT ; + i = SIGBUS; + break; + + case T_PAGEFLT: /* page fault */ + i = trap_pfault(&frame, TRUE, eva); + if (i == -1) + return; +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if (i == -2) + goto restart; +#endif + if (i == 0) + goto out; + + ucode = T_PAGEFLT; + break; + + case T_DIVIDE: /* integer divide fault */ + ucode = FPE_INTDIV_TRAP; + i = SIGFPE; + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI + goto handle_powerfail; +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... 
going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + panic("NMI indicates hardware failure"); +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + + case T_OFLOW: /* integer overflow fault */ + ucode = FPE_INTOVF_TRAP; + i = SIGFPE; + break; - case T_ASTFLT|T_USER: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { - addupc(frame.tf_eip, &p->p_stats->p_prof, 1); - p->p_flag &= ~SOWEUPC; + case T_BOUND: /* bounds check fault */ + ucode = FPE_SUBRNG_TRAP; + i = SIGFPE; + break; + + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif + if (!pmath_emulate) { + i = SIGFPE; + ucode = FPE_FPU_NP_TRAP; + break; + } + i = (*pmath_emulate)(&frame); + if (i == 0) { + if (!(frame.tf_eflags & PSL_T)) + return; + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + } + /* else ucode = emulator_only_knows() XXX */ + break; + + case T_FPOPFLT: /* FPU operand fetch fault */ + ucode = T_FPOPFLT; + i = SIGILL; + break; } - goto out; + } else { +#ifdef VM86 +kernel_trap: +#endif + /* kernel trap */ + + switch (type) { + case T_PAGEFLT: /* page fault */ + (void) trap_pfault(&frame, FALSE, eva); + return; - case T_DNA|T_USER: -#ifdef NPX - /* if a transparent fault (due to context switch "late") */ - if (npxdna()) return; + case T_DNA: +#if NNPX > 0 + /* + * The kernel is apparently using npx for copying. + * XXX this should be fatal unless the kernel has + * registered such use. + */ + if (npxdna()) + return; #endif - i = math_emulate(&frame); - if (i == 0) return; - ucode = FPE_FPU_NP_TRAP; - break; + break; - case T_BOUND|T_USER: - ucode = FPE_SUBRNG_TRAP; - i = SIGFPE; - break; + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. + */ +#define MAYBE_DORETI_FAULT(where, whereto) \ + do { \ + if (frame.tf_eip == (int)where) { \ + frame.tf_eip = (int)whereto; \ + return; \ + } \ + } while (0) - case T_OFLOW|T_USER: - ucode = FPE_INTOVF_TRAP; - i = SIGFPE; - break; + if (intr_nesting_level == 0) { + /* + * Invalid %fs's and %gs's can be created using + * procfs or PT_SETREGS or by invalidating the + * underlying LDT entry. This causes a fault + * in kernel mode when the kernel attempts to + * switch contexts. Lose the bad context + * (XXX) so that we can continue, and generate + * a signal. 
+ */ + if (frame.tf_eip == (int)cpu_switch_load_fs) { + curpcb->pcb_fs = 0; + psignal(p, SIGBUS); + return; + } + if (frame.tf_eip == (int)cpu_switch_load_gs) { + curpcb->pcb_gs = 0; + psignal(p, SIGBUS); + return; + } + MAYBE_DORETI_FAULT(doreti_iret, + doreti_iret_fault); + MAYBE_DORETI_FAULT(doreti_popl_ds, + doreti_popl_ds_fault); + MAYBE_DORETI_FAULT(doreti_popl_es, + doreti_popl_es_fault); + if (curpcb && curpcb->pcb_onfault) { + frame.tf_eip = (int)curpcb->pcb_onfault; + return; + } + } + break; - case T_DIVIDE|T_USER: - ucode = FPE_INTDIV_TRAP; - i = SIGFPE; - break; + case T_TSSFLT: + /* + * PSL_NT can be set in user mode and isn't cleared + * automatically when the kernel is entered. This + * causes a TSS fault when the kernel attempts to + * `iret' because the TSS link is uninitialized. We + * want to get this fault so that we can fix the + * problem here and not every time the kernel is + * entered. + */ + if (frame.tf_eflags & PSL_NT) { + frame.tf_eflags &= ~PSL_NT; + return; + } + break; - case T_ARITHTRAP|T_USER: - ucode = code; - i = SIGFPE; - break; + case T_TRCTRAP: /* trace trap */ + if (frame.tf_eip == (int)IDTVEC(syscall)) { + /* + * We've just entered system mode via the + * syscall lcall. Continue single stepping + * silently until the syscall handler has + * saved the flags. + */ + return; + } + if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { + /* + * The syscall handler has now saved the + * flags. Stop single stepping it. + */ + frame.tf_eflags &= ~PSL_T; + return; + } + /* + * Fall through. + */ + case T_BPTFLT: + /* + * If DDB is enabled, let it handle the debugger trap. + * Otherwise, debugger traps "can't happen". + */ +#ifdef DDB + if (kdb_trap (type, 0, &frame)) + return; +#endif + break; - case T_PAGEFLT: /* allow page faults in kernel mode */ -#if 0 - /* XXX - check only applies to 386's and 486's with WP off */ - if (code & PGEX_P) goto we_re_toast; +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 #endif + handle_powerfail: + { + static unsigned lastalert = 0; + + if(time_second - lastalert > 10) + { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + return; + } +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + /* FALL THROUGH */ +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + } + + trap_fatal(&frame, eva); + return; + } + + /* Translate fault for emulators (e.g. Linux) */ + if (*p->p_sysent->sv_transtrap) + i = (*p->p_sysent->sv_transtrap)(i, type); + + trapsignal(p, i, ucode); + +#ifdef DEBUG + if (type <= MAX_TRAP_MSG) { + uprintf("fatal process exception: %s", + trap_msg[type]); + if ((type == T_PAGEFLT) || (type == T_PROTFLT)) + uprintf(", fault VA = 0x%lx", (u_long)eva); + uprintf("\n"); + } +#endif + +out: + userret(p, &frame, sticks); +} + +#ifdef notyet +/* + * This version doesn't allow a page fault to user space while + * in the kernel. The rest of the kernel needs to be made "safe" + * before this can be used. I think the only things remaining + * to be made safe are the iBCS2 code and the process tracing/ + * debugging code. 
+ */ +static int +trap_pfault(frame, usermode, eva) + struct trapframe *frame; + int usermode; + vm_offset_t eva; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + struct proc *p = curproc; + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + va = trunc_page(eva); + if (va < VM_MIN_KERNEL_ADDRESS) { + vm_offset_t v; + vm_page_t mpte; + + if (p == NULL || + (!usermode && va < VM_MAXUSER_ADDRESS && + (intr_nesting_level != 0 || curpcb == NULL || + curpcb->pcb_onfault == NULL))) { + trap_fatal(frame, eva); + return (-1); + } - /* fall into */ - case T_PAGEFLT|T_USER: /* page fault */ - { - register vm_offset_t va; - register struct vmspace *vm = p->p_vmspace; - register vm_map_t map; - int rv; - vm_prot_t ftype; - extern vm_map_t kernel_map; - unsigned nss,v; - - va = trunc_page((vm_offset_t)eva); /* - * Avoid even looking at pde_v(va) for high va's. va's - * above VM_MAX_KERNEL_ADDRESS don't correspond to normal - * PDE's (half of them correspond to APDEpde and half to - * an unmapped kernel PDE). va's betweeen 0xFEC00000 and - * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's - * (XXX - why are only 3 initialized when 6 are required to - * reach VM_MAX_KERNEL_ADDRESS?). Faulting in an unmapped - * kernel page table would give inconsistent PTD's. - * - * XXX - faulting in unmapped page tables wastes a page if - * va turns out to be invalid. - * - * XXX - should "kernel address space" cover the kernel page - * tables? Might have same problem with PDEpde as with - * APDEpde (or there may be no problem with APDEpde). + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. */ - if (va > 0xFEBFF000) { - rv = KERN_FAILURE; /* becomes SIGBUS */ + vm = p->p_vmspace; + if (vm == NULL) goto nogo; - } + + map = &vm->vm_map; + /* - * It is only a kernel address space fault iff: - * 1. (type & T_USER) == 0 and - * 2. pcb_onfault not set or - * 3. pcb_onfault set but supervisor space fault - * The last can occur during an exec() copyin where the - * argument space is lazy-allocated. + * Keep swapout from messing with us during this + * critical time. 
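+		 * Bumping p_lock marks the process as busy to the VM
+		 * system, so the swapper leaves its pages alone until
+		 * the count is dropped again once the fault has been
+		 * resolved.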
*/ - if (type == T_PAGEFLT && va >= KERNBASE) - map = kernel_map; - else - map = &vm->vm_map; - if (code & PGEX_W) - ftype = VM_PROT_READ | VM_PROT_WRITE; - else - ftype = VM_PROT_READ; - -#ifdef DEBUG - if (map == kernel_map && va == 0) { - printf("trap: bad kernel access at %x\n", va); - goto we_re_toast; - } -#endif + ++p->p_lock; /* - * XXX: rude hack to make stack limits "work" + * Grow the stack if necessary */ - nss = 0; - if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map - && dostacklimits) { - nss = clrnd(btoc((unsigned)vm->vm_maxsaddr - + MAXSSIZ - (unsigned)va)); - if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) { -/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/ +#ifndef VM_STACK + if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { rv = KERN_FAILURE; + --p->p_lock; goto nogo; } } - /* check if page table is mapped, if not, fault it first */ -#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) - if (!pde_v(va)) { - v = trunc_page(vtopte(va)); - rv = vm_fault(map, v, ftype, FALSE); - if (rv != KERN_SUCCESS) goto nogo; - /* check if page table fault, increment wiring */ - vm_map_pageable(map, v, round_page(v+1), FALSE); - } else v=0; - rv = vm_fault(map, va, ftype, FALSE); - if (rv == KERN_SUCCESS) { - /* - * XXX: continuation of rude stack hack - */ - if (nss > vm->vm_ssize) - vm->vm_ssize = nss; - va = trunc_page(vtopte(va)); - /* for page table, increment wiring - as long as not a page table fault as well */ - if (!v && type != T_PAGEFLT) - vm_map_pageable(map, va, round_page(va+1), FALSE); - if (type == T_PAGEFLT) - return; - goto out; +#else + /* grow_stack returns false only if va falls into + * a growable stack region and the stack growth + * fails. It returns true if va was not within + * a growable stack region, or if the stack + * growth succeeded. + */ + if (!grow_stack (p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; } +#endif + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, + (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0); + + --p->p_lock; + } else { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(kernel_map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); nogo: - if (type == T_PAGEFLT) { - if (curpcb->pcb_onfault) - goto copyfault; - printf("vm_fault(%x, %x, %x, 0) -> %x\n", - map, va, ftype, rv); - printf(" type %x, code %x\n", - type, code); - goto we_re_toast; + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); } - i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; - break; - } + trap_fatal(frame, eva); + return (-1); + } -#if NDDB == 0 - case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ - frame.tf_eflags &= ~PSL_T; + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; - /* Q: how do we turn it on again? */ - return; + return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); +} #endif - - case T_BPTFLT|T_USER: /* bpt instruction fault */ - case T_TRCTRAP|T_USER: /* trace trap */ - frame.tf_eflags &= ~PSL_T; - i = SIGTRAP; - break; -#include "isa.h" -#if NISA > 0 - case T_NMI: - case T_NMI|T_USER: -#if NDDB > 0 - /* NMI can be hooked up to a pushbutton for debugging */ - printf ("NMI ... 
going to debugger\n"); - if (kdb_trap (type, 0, &frame)) - return; -#endif - /* machine/parity/power fail/"kitchen sink" faults */ - if(isa_nmi(code) == 0) return; - else goto we_re_toast; +int +trap_pfault(frame, usermode, eva) + struct trapframe *frame; + int usermode; + vm_offset_t eva; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + struct proc *p = curproc; + + va = trunc_page(eva); + if (va >= KERNBASE) { + /* + * Don't allow user-mode faults in kernel address space. + * An exception: if the faulting address is the invalid + * instruction entry in the IDT, then the Intel Pentium + * F00F bug workaround was triggered, and we need to + * treat it is as an illegal instruction, and not a page + * fault. + */ +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if ((eva == (unsigned int)&t_idt[6]) && has_f00f_bug) { + frame->tf_trapno = T_PRIVINFLT; + return -2; + } #endif - } + if (usermode) + goto nogo; - trapsignal(p, i, ucode); - if ((type & T_USER) == 0) - return; -out: - while (i = CURSIG(p)) - psig(i); - p->p_pri = p->p_usrpri; - if (want_resched) { + map = kernel_map; + } else { /* - * Since we are curproc, clock will normally just change - * our priority without moving us from one queue to another - * (since the running process is not on a queue.) - * If that happened after we setrq ourselves but before we - * swtch()'ed, we might not be on the queue indicated by - * our priority. + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. */ - (void) splclock(); - setrq(p); - p->p_stats->p_ru.ru_nivcsw++; - swtch(); - (void) splnone(); - while (i = CURSIG(p)) - psig(i); + if (p != NULL) + vm = p->p_vmspace; + + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; } - if (p->p_stats->p_prof.pr_scale) { - int ticks; - struct timeval *tv = &p->p_stime; - - ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + - (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); - if (ticks) { -#ifdef PROFTIMER - extern int profscale; - addupc(frame.tf_eip, &p->p_stats->p_prof, - ticks * profscale); + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + if (map != kernel_map) { + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ +#ifndef VM_STACK + if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } #else - addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); + /* grow_stack returns false only if va falls into + * a growable stack region and the stack growth + * fails. It returns true if va was not within + * a growable stack region, or if the stack + * growth succeeded. + */ + if (!grow_stack (p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } #endif + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, + (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0); + + --p->p_lock; + } else { + /* + * Don't have to worry about process locking or stacks in the kernel. 
+ */ + rv = vm_fault(map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); } + trap_fatal(frame, eva); + return (-1); } - curpri = p->p_pri; - curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); +} + +static void +trap_fatal(frame, eva) + struct trapframe *frame; + vm_offset_t eva; +{ + int code, type, ss, esp; + struct soft_segment_descriptor softseg; + + code = frame->tf_err; + type = frame->tf_trapno; + sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); + + if (type <= MAX_TRAP_MSG) + printf("\n\nFatal trap %d: %s while in %s mode\n", + type, trap_msg[type], + frame->tf_eflags & PSL_VM ? "vm86" : + ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); +#ifdef SMP + /* three seperate prints in case of a trap on an unmapped page */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + if (type == T_PAGEFLT) { + printf("fault virtual address = 0x%x\n", eva); + printf("fault code = %s %s, %s\n", + code & PGEX_U ? "user" : "supervisor", + code & PGEX_W ? "write" : "read", + code & PGEX_P ? "protection violation" : "page not present"); + } + printf("instruction pointer = 0x%x:0x%x\n", + frame->tf_cs & 0xffff, frame->tf_eip); + if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { + ss = frame->tf_ss & 0xffff; + esp = frame->tf_esp; + } else { + ss = GSEL(GDATA_SEL, SEL_KPL); + esp = (int)&frame->tf_esp; + } + printf("stack pointer = 0x%x:0x%x\n", ss, esp); + printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); + printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", + softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); + printf(" = DPL %d, pres %d, def32 %d, gran %d\n", + softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, + softseg.ssd_gran); + printf("processor eflags = "); + if (frame->tf_eflags & PSL_T) + printf("trace trap, "); + if (frame->tf_eflags & PSL_I) + printf("interrupt enabled, "); + if (frame->tf_eflags & PSL_NT) + printf("nested task, "); + if (frame->tf_eflags & PSL_RF) + printf("resume, "); + if (frame->tf_eflags & PSL_VM) + printf("vm86, "); + printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); + printf("current process = "); + if (curproc) { + printf("%lu (%s)\n", + (u_long)curproc->p_pid, curproc->p_comm ? + curproc->p_comm : ""); + } else { + printf("Idle\n"); + } + printf("interrupt mask = "); + if ((cpl & net_imask) == net_imask) + printf("net "); + if ((cpl & tty_imask) == tty_imask) + printf("tty "); + if ((cpl & bio_imask) == bio_imask) + printf("bio "); + if ((cpl & cam_imask) == cam_imask) + printf("cam "); + if (cpl == 0) + printf("none"); +#ifdef SMP +/** + * XXX FIXME: + * we probably SHOULD have stopped the other CPUs before now! + * another CPU COULD have been touching cpl at this moment... + */ + printf(" <- SMP: XXX"); +#endif + printf("\n"); + +#ifdef KDB + if (kdb_trap(&psl)) + return; +#endif +#ifdef DDB + if ((debugger_on_panic || in_Debugger) && kdb_trap(type, 0, frame)) + return; +#endif + printf("trap number = %d\n", type); + if (type <= MAX_TRAP_MSG) + panic(trap_msg[type]); + else + panic("unknown/reserved trap"); } /* - * Compensate for 386 brain damage (missing URKR) + * Double fault handler. 
Called when a fault occurs while writing + * a frame for a trap/exception onto the stack. This usually occurs + * when the stack overflows (such is the case with infinite recursion, + * for example). + * + * XXX Note that the current PTD gets replaced by IdlePTD when the + * task switch occurs. This means that the stack that was active at + * the time of the double fault is not available at <kstack> unless + * the machine was idle when the double fault occurred. The downside + * of this is that "trace <ebp>" in ddb won't work. */ -int trapwrite(unsigned addr) { - int rv; +void +dblfault_handler() +{ + printf("\nFatal double fault:\n"); + printf("eip = 0x%x\n", common_tss.tss_eip); + printf("esp = 0x%x\n", common_tss.tss_esp); + printf("ebp = 0x%x\n", common_tss.tss_ebp); +#ifdef SMP + /* three seperate prints in case of a trap on an unmapped page */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + panic("double fault"); +} + +/* + * Compensate for 386 brain damage (missing URKR). + * This is a little simpler than the pagefault handler in trap() because + * it the page tables have already been faulted in and high addresses + * are thrown out early for other reasons. + */ +int trapwrite(addr) + unsigned addr; +{ + struct proc *p; vm_offset_t va; + struct vmspace *vm; + int rv; va = trunc_page((vm_offset_t)addr); - if (va > VM_MAXUSER_ADDRESS) return(1); - rv = vm_fault(&curproc->p_vmspace->vm_map, va, - VM_PROT_READ | VM_PROT_WRITE, FALSE); - if (rv == KERN_SUCCESS) return(0); - else return(1); + /* + * XXX - MAX is END. Changed > to >= for temp. fix. + */ + if (va >= VM_MAXUSER_ADDRESS) + return (1); + + p = curproc; + vm = p->p_vmspace; + + ++p->p_lock; + +#ifndef VM_STACK + if ((caddr_t)va >= vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { + --p->p_lock; + return (1); + } + } +#else + if (!grow_stack (p, va)) { + --p->p_lock; + return (1); + } +#endif + + /* + * fault the data page + */ + rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY); + + --p->p_lock; + + if (rv != KERN_SUCCESS) + return 1; + + return (0); } /* - * syscall(frame): - * System call request from POSIX system call gate interface to kernel. + * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. */ -/*ARGSUSED*/ +void syscall(frame) - volatile struct syscframe frame; + struct trapframe frame; { - register int *locr0 = ((int *)&frame); - register caddr_t params; - register int i; - register struct sysent *callp; - register struct proc *p = curproc; - struct timeval syst; - int error, opc; - int args[8], rval[2]; - int code; - -#ifdef lint - r0 = 0; r0 = r0; r1 = 0; r1 = r1; -#endif - syst = p->p_stime; - if (ISPL(frame.sf_cs) != SEL_UPL) + caddr_t params; + int i; + struct sysent *callp; + struct proc *p = curproc; + u_quad_t sticks; + int error; + int args[8]; + u_int code; + +#ifdef DIAGNOSTIC + if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); +#endif + sticks = p->p_sticks; + p->p_md.md_regs = &frame; + params = (caddr_t)frame.tf_esp + sizeof(int); + code = frame.tf_eax; + if (p->p_sysent->sv_prepsyscall) { + (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); + } else { + /* + * Need to check if this is a 32 bit or 64 bit syscall. + */ + if (code == SYS_syscall) { + /* + * Code is first argument, followed by actual args. 
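+			 * For example, an indirect call such as
+			 * syscall(SYS_write, fd, buf, nbyte) from user
+			 * mode lands here with code == SYS_syscall; the
+			 * real code (SYS_write) is the first word at
+			 * params and the write() arguments follow it.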
+ */ + code = fuword(params); + params += sizeof(int); + } else if (code == SYS___syscall) { + /* + * Like syscall, but code is a quad, so as to maintain + * quad alignment for the rest of the arguments. + */ + code = fuword(params); + params += sizeof(quad_t); + } + } - code = frame.sf_eax; - curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ - p->p_regs = (int *)&frame; - params = (caddr_t)frame.sf_esp + sizeof (int) ; + if (p->p_sysent->sv_mask) + code &= p->p_sysent->sv_mask; - /* - * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. - */ - opc = frame.sf_eip - 7; - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - if (callp == sysent) { - i = fuword(params); - params += sizeof (int); - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - } + if (code >= p->p_sysent->sv_size) + callp = &p->p_sysent->sv_table[0]; + else + callp = &p->p_sysent->sv_table[code]; - if ((i = callp->sy_narg * sizeof (int)) && + if (params && (i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - frame.sf_eax = error; - frame.sf_eflags |= PSL_C; /* carry bit */ #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif - goto done; + goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif - rval[0] = 0; - rval[1] = frame.sf_edx; -/*pg("%d. s %d\n", p->p_pid, code);*/ - error = (*callp->sy_call)(p, args, rval); - if (error == ERESTART) - frame.sf_eip = opc; - else if (error != EJUSTRETURN) { - if (error) { -/*pg("error %d", error);*/ - frame.sf_eax = error; - frame.sf_eflags |= PSL_C; /* carry bit */ - } else { - frame.sf_eax = rval[0]; - frame.sf_edx = rval[1]; - frame.sf_eflags &= ~PSL_C; /* carry bit */ - } - } - /* else if (error == EJUSTRETURN) */ - /* nothing to do */ -done: - /* - * Reinitialize proc pointer `p' as it may be different - * if this is a child returning from fork syscall. - */ - p = curproc; - while (i = CURSIG(p)) - psig(i); - p->p_pri = p->p_usrpri; - if (want_resched) { + p->p_retval[0] = 0; + p->p_retval[1] = frame.tf_edx; + + STOPEVENT(p, S_SCE, callp->sy_narg); + + error = (*callp->sy_call)(p, args); + + switch (error) { + + case 0: /* - * Since we are curproc, clock will normally just change - * our priority without moving us from one queue to another - * (since the running process is not on a queue.) - * If that happened after we setrq ourselves but before we - * swtch()'ed, we might not be on the queue indicated by - * our priority. + * Reinitialize proc pointer `p' as it may be different + * if this is a child returning from fork syscall. */ - (void) splclock(); - setrq(p); - p->p_stats->p_ru.ru_nivcsw++; - swtch(); - (void) splnone(); - while (i = CURSIG(p)) - psig(i); + p = curproc; + frame.tf_eax = p->p_retval[0]; + frame.tf_edx = p->p_retval[1]; + frame.tf_eflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, assuming lcall $X,y is 7 bytes, + * int 0x80 is 2 bytes. We saved this in tf_err. 
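+		 * Backing tf_eip up by that length makes the saved pc
+		 * point at the trapping instruction again, so the lcall
+		 * or int 0x80 is simply re-issued when the process
+		 * returns to user mode.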
+ */ + frame.tf_eip -= frame.tf_err; + break; + + case EJUSTRETURN: + break; + + default: +bad: + if (p->p_sysent->sv_errsize) + if (error >= p->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = p->p_sysent->sv_errtbl[error]; + frame.tf_eax = error; + frame.tf_eflags |= PSL_C; + break; } - if (p->p_stats->p_prof.pr_scale) { - int ticks; - struct timeval *tv = &p->p_stime; - - ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + - (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); - if (ticks) { -#ifdef PROFTIMER - extern int profscale; - addupc(frame.sf_eip, &p->p_stats->p_prof, - ticks * profscale); -#else - addupc(frame.sf_eip, &p->p_stats->p_prof, ticks); -#endif - } + + if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { + /* Traced syscall. */ + frame.tf_eflags &= ~PSL_T; + trapsignal(p, SIGTRAP, 0); } - curpri = p->p_pri; + + userret(p, &frame, sticks); + #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) - ktrsysret(p->p_tracep, code, error, rval[0]); + ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif -#ifdef DIAGNOSTICx -{ extern int _udatasel, _ucodesel; - if (frame.sf_ss != _udatasel) - printf("ss %x call %d\n", frame.sf_ss, code); - if ((frame.sf_cs&0xffff) != _ucodesel) - printf("cs %x call %d\n", frame.sf_cs, code); - if (frame.sf_eip > VM_MAXUSER_ADDRESS) { - printf("eip %x call %d\n", frame.sf_eip, code); - frame.sf_eip = 0; - } + + /* + * This works because errno is findable through the + * register set. If we ever support an emulation where this + * is not the case, this code will need to be revisited. + */ + STOPEVENT(p, S_SCX, code); + } + +/* + * Simplified back end of syscall(), used when returning from fork() + * directly into user mode. + */ +void +fork_return(p, frame) + struct proc *p; + struct trapframe frame; +{ + frame.tf_eax = 0; /* Child returns zero */ + frame.tf_eflags &= ~PSL_C; /* success */ + frame.tf_edx = 1; + + userret(p, &frame, 0); +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSRET)) + ktrsysret(p->p_tracep, SYS_fork, 0, 0); #endif } diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c new file mode 100644 index 0000000..7ff3366 --- /dev/null +++ b/sys/kern/subr_xxx.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_xxx.c,v 1.11 1998/08/20 06:10:40 bde Exp $ + */ + +/* + * Miscellaneous trivial functions. + */ +#include <sys/param.h> +#include <sys/systm.h> + +/* + * Return error for operation not supported + * on a specific object or file type. + */ +int +eopnotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Return error for an inval operation + * on a specific object or file type. + */ +int +einval() +{ + + return (EINVAL); +} + +/* + * Generic null operation, always returns success. + */ +int +nullop() +{ + + return (0); +} + +#include <sys/conf.h> + +/* + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. + */ + +int +noopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + return (ENODEV); +} + +void +nostop(tp, rw) + struct tty *tp; + int rw; +{ + +} + +int +noreset(dev) + dev_t dev; +{ + + printf("noreset(0x%x) called\n", dev); + return (ENODEV); +} + +struct tty * +nodevtotty(dev) + dev_t dev; +{ + + return (NULL); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + vm_offset_t offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + +int +nodump(dev) + dev_t dev; +{ + + return (ENODEV); +} + +/* + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. + */ +int +nullopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); +} + +int +nullclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); +} diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c new file mode 100644 index 0000000..8d90ee9 --- /dev/null +++ b/sys/kern/sys_generic.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $Id: sys_generic.c,v 1.42 1998/11/11 10:03:55 truckman Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/socketvar.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/sysent.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <machine/limits.h> + +static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); +static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); +MALLOC_DEFINE(M_IOV, "iov", "large iov's"); + +static int pollscan __P((struct proc *, struct pollfd *, int)); +static int selscan __P((struct proc *, fd_mask **, fd_mask **, int)); + +/* + * Read system call. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + void *buf; + size_t nbyte; +}; +#endif +/* ARGSUSED */ +int +read(p, uap) + struct proc *p; + register struct read_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = -1; + if (uap->nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error); +#endif + p->p_retval[0] = cnt; + return (error); +} + +/* + * Scatter read system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +int +readv(p, uap) + struct proc *p; + register struct readv_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov, + cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + p->p_retval[0] = cnt; +done: + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + const void *buf; + size_t nbyte; +}; +#endif +int +write(p, uap) + 
struct proc *p; + register struct write_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = -1; + if (uap->nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + &ktriov, cnt, error); +#endif + p->p_retval[0] = cnt; + return (error); +} + +/* + * Gather write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +int +writev(p, uap) + struct proc *p; + register struct writev_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + ktriov, cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + p->p_retval[0] = cnt; +done: + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Ioctl system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + u_long com; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +ioctl(p, uap) + struct proc 
*p; + register struct ioctl_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp; + register u_long com; + int error; + register u_int size; + caddr_t data, memp; + int tmp; +#define STK_PARAMS 128 + char stkbuf[STK_PARAMS]; + + fdp = p->p_fd; + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + + if ((fp->f_flag & (FREAD | FWRITE)) == 0) + return (EBADF); + + switch (com = uap->com) { + case FIONCLEX: + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; + return (0); + case FIOCLEX: + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; + return (0); + } + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's address space. + */ + size = IOCPARM_LEN(com); + if (size > IOCPARM_MAX) + return (ENOTTY); + memp = NULL; + if (size > sizeof (stkbuf)) { + memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + data = memp; + } else + data = stkbuf; + if (com&IOC_IN) { + if (size) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (memp) + free(memp, M_IOCTLOPS); + return (error); + } + } else + *(caddr_t *)data = uap->data; + } else if ((com&IOC_OUT) && size) + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + else if (com&IOC_VOID) + *(caddr_t *)data = uap->data; + + switch (com) { + + case FIONBIO: + if ((tmp = *(int *)data)) + fp->f_flag |= FNONBLOCK; + else + fp->f_flag &= ~FNONBLOCK; + error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + break; + + case FIOASYNC: + if ((tmp = *(int *)data)) + fp->f_flag |= FASYNC; + else + fp->f_flag &= ~FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + break; + + default: + error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); + /* + * Copy any data to user, size was + * already set and checked above. + */ + if (error == 0 && (com&IOC_OUT) && size) + error = copyout(data, uap->data, (u_int)size); + break; + } + if (memp) + free(memp, M_IOCTLOPS); + return (error); +} + +static int nselcoll; +int selwait; + +/* + * Select system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif +int +select(p, uap) + register struct proc *p; + register struct select_args *uap; +{ + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. + */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; + struct timeval atv, rtv, ttv; + int s, ncoll, error, timo; + u_int nbufbytes, ncpbytes, nfdbits; + + if (uap->nd < 0) + return (EINVAL); + if (uap->nd > p->p_fd->fd_nfiles) + uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. + */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. 
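+	 * The resulting layout places one ncpbytes-sized output copy
+	 * per non-null set back to back in the first half of the
+	 * buffer, with the corresponding input copies in the second
+	 * half, so a single bzero of nbufbytes / 2 clears every
+	 * output set at once.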
+ */ + sbp = selbits; +#define getbits(name, x) \ + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done; \ + } \ + } while (0) + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); + + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else + atv.tv_sec = 0; + timo = 0; +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = selscan(p, ibits, obits, uap->nd); + if (error || p->p_retval[0]) + goto done; + if (atv.tv_sec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + s = splhigh(); + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* select is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); + return (error); +} + +static int +selscan(p, ibits, obits, nfd) + struct proc *p; + fd_mask **ibits, **obits; + int nfd; +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register fd_mask bits; + struct file *fp; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk][i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + return (EBADF); + if ((*fp->f_ops->fo_poll)(fp, flag[msk], + fp->f_cred, p)) { + obits[msk][(fd)/NFDBITS] |= + (1 << ((fd) % NFDBITS)); + n++; + } + } + } + } + p->p_retval[0] = n; + return (0); +} + +/* + * Poll system call. 
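+ * Unlike select(), the caller hands in an array of pollfd
+ * structures, so there are no fd_set bitmaps to round up to
+ * NFDBITS or copy back selectively; the revents field of each
+ * entry is simply filled in by pollscan().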
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +int +poll(p, uap) + register struct proc *p; + register struct poll_args *uap; +{ + caddr_t bits; + char smallbits[32 * sizeof(struct pollfd)]; + struct timeval atv, rtv, ttv; + int s, ncoll, error = 0, timo; + size_t ni; + + if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) { + /* forgiving; slightly wrong */ + SCARG(uap, nfds) = p->p_fd->fd_nfiles; + } + ni = SCARG(uap, nfds) * sizeof(struct pollfd); + if (ni > sizeof(smallbits)) + bits = malloc(ni, M_TEMP, M_WAITOK); + else + bits = smallbits; + error = copyin(SCARG(uap, fds), bits, ni); + if (error) + goto done; + if (SCARG(uap, timeout) != INFTIM) { + atv.tv_sec = SCARG(uap, timeout) / 1000; + atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else + atv.tv_sec = 0; + timo = 0; +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds)); + if (error || p->p_retval[0]) + goto done; + if (atv.tv_sec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + s = splhigh(); + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* poll is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + if (error == 0) { + error = copyout(bits, SCARG(uap, fds), ni); + if (error) + goto out; + } +out: + if (ni > sizeof(smallbits)) + free(bits, M_TEMP); + return (error); +} + +static int +pollscan(p, fds, nfd) + struct proc *p; + struct pollfd *fds; + int nfd; +{ + register struct filedesc *fdp = p->p_fd; + int i; + struct file *fp; + int n = 0; + + for (i = 0; i < nfd; i++, fds++) { + if (fds->fd >= fdp->fd_nfiles) { + fds->revents = POLLNVAL; + n++; + } else if (fds->fd < 0) { + fds->revents = 0; + } else { + fp = fdp->fd_ofiles[fds->fd]; + if (fp == 0) { + fds->revents = POLLNVAL; + n++; + } else { + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fds->revents = (*fp->f_ops->fo_poll)(fp, + fds->events, fp->f_cred, p); + if (fds->revents != 0) + n++; + } + } + } + p->p_retval[0] = n; + return (0); +} + +/* + * OpenBSD poll system call. + * XXX this isn't quite a true representation.. OpenBSD uses select ops. + */ +#ifndef _SYS_SYSPROTO_H_ +struct openbsd_poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +int +openbsd_poll(p, uap) + register struct proc *p; + register struct openbsd_poll_args *uap; +{ + return (poll(p, (struct poll_args *)uap)); +} + +/*ARGSUSED*/ +int +seltrue(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + + return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Record a select request. + */ +void +selrecord(selector, sip) + struct proc *selector; + struct selinfo *sip; +{ + struct proc *p; + pid_t mypid; + + mypid = selector->p_pid; + if (sip->si_pid == mypid) + return; + if (sip->si_pid && (p = pfind(sip->si_pid)) && + p->p_wchan == (caddr_t)&selwait) + sip->si_flags |= SI_COLL; + else + sip->si_pid = mypid; +} + +/* + * Do a wakeup when a selectable event occurs. 
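+ * If several processes collided on the same selinfo, SI_COLL is
+ * set and everyone sleeping on the shared selwait channel is
+ * woken so that they all rescan; the single process recorded in
+ * si_pid is then made runnable (or unslept) directly.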
+ */ +void +selwakeup(sip) + register struct selinfo *sip; +{ + register struct proc *p; + int s; + + if (sip->si_pid == 0) + return; + if (sip->si_flags & SI_COLL) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + wakeup((caddr_t)&selwait); + } + p = pfind(sip->si_pid); + sip->si_pid = 0; + if (p != NULL) { + s = splhigh(); + if (p->p_wchan == (caddr_t)&selwait) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + } else if (p->p_flag & P_SELECT) + p->p_flag &= ~P_SELECT; + splx(s); + } +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..29e1e97 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1102 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $Id: sys_pipe.c,v 1.45 1998/11/11 10:03:55 truckman Exp $ + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. 
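+ *
+ * As a concrete illustration (assuming, say, an 8K PIPE_MINDIRECT;
+ * the actual value is set in the pipe header): a 100-byte write is
+ * staged through the kernel buffer like a traditional pipe, whereas
+ * a 12K write has its source pages wired and mapped so the reader
+ * copies them directly.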
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/poll.h> +#include <sys/signalvar.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_zone.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_close __P((struct file *fp, struct proc *p)); +static int pipe_poll __P((struct file *fp, int events, struct ucred *cred, + struct proc *p)); +static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p)); + +static struct fileops pipeops = + { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_close }; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. 
+ */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +static int nbigpipe; + +static int amountpipekva; + +static void pipeclose __P((struct pipe *cpipe)); +static void pipeinit __P((struct pipe *cpipe)); +static __inline int pipelock __P((struct pipe *cpipe, int catch)); +static __inline void pipeunlock __P((struct pipe *cpipe)); +static __inline void pipeselwakeup __P((struct pipe *cpipe)); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); +static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_clone_write_buffer __P((struct pipe *wpipe)); +#endif +static void pipespace __P((struct pipe *cpipe)); + +static vm_zone_t pipe_zone; + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(p, uap) + struct proc *p; + struct pipe_args /* { + int dummy; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + int fd, error; + + if (pipe_zone == NULL) + pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4); + + rpipe = zalloc( pipe_zone); + pipeinit(rpipe); + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe = zalloc( pipe_zone); + pipeinit(wpipe); + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(p, &rf, &fd); + if (error) + goto free2; + p->p_retval[0] = fd; + rf->f_flag = FREAD | FWRITE; + rf->f_type = DTYPE_PIPE; + rf->f_ops = &pipeops; + rf->f_data = (caddr_t)rpipe; + error = falloc(p, &wf, &fd); + if (error) + goto free3; + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_ops = &pipeops; + wf->f_data = (caddr_t)wpipe; + p->p_retval[1] = fd; + + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + + return (0); +free3: + ffree(rf); + fdp->fd_ofiles[p->p_retval[0]] = 0; +free2: + (void)pipeclose(wpipe); + (void)pipeclose(rpipe); + return (error); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + */ +static void +pipespace(cpipe) + struct pipe *cpipe; +{ + int npages, error; + + npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); + cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
+ */ + error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, + (vm_offset_t *) &cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) + panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); + amountpipekva += cpipe->pipe_buffer.size; +} + +/* + * initialize and allocate VM and memory for pipe + */ +static void +pipeinit(cpipe) + struct pipe *cpipe; +{ + + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + cpipe->pipe_buffer.size = PIPE_SIZE; + + /* Buffer kva gets dynamically allocated */ + cpipe->pipe_buffer.buffer = NULL; + /* cpipe->pipe_buffer.object = invalid */ + + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + getnanotime(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + while (cpipe->pipe_state & PIPE_LOCK) { + cpipe->pipe_state |= PIPE_LWANT; + if (error = tsleep( cpipe, + catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { + return error; + } + } + cpipe->pipe_state |= PIPE_LOCK; + return 0; +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + cpipe->pipe_state &= ~PIPE_LOCK; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + if (cpipe->pipe_state & PIPE_SEL) { + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) + pgsigio(cpipe->pipe_sigio, SIGIO, 0); +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error = 0; + int nread = 0; + u_int size; + + ++rpipe->pipe_busy; + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + pipeunlock(rpipe); + } + if (error) { + break; + } + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. 
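+		 * The reader copies straight out of the writer's wired
+		 * pages through the pipe_map kva window that the write
+		 * side set up, and wakes the writer once pipe_map.cnt
+		 * drains to zero.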
+ */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; + error = uiomove(va, size, uio); + pipeunlock(rpipe); + } + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + */ + if (rpipe->pipe_state & PIPE_EOF) { + /* XXX error = ? */ + break; + } + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + if (nread > 0) + break; + + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + + if ((error = pipelock(rpipe,1)) == 0) { + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + pipeunlock(rpipe); + } else { + break; + } + + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + rpipe->pipe_state |= PIPE_WANTR; + if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { + break; + } + } + } + + if (error == 0) + getnanotime(&rpipe->pipe_atime); + + --rpipe->pipe_busy; + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + pipeunlock(rpipe); + } + } + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + return error; +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. 
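One detail worth spelling out for the routine that follows: a user buffer is rarely page aligned, so wiring and mapping a transfer of len bytes can touch one page more than len / PAGE_SIZE, which is why the direct-write KVA window is later sized at pipe_buffer.size + PAGE_SIZE. A small arithmetic sketch (hypothetical macros, page size assumed 4 KB for illustration):

#include <stddef.h>
#include <stdint.h>

#define	PG_SIZE		4096UL				/* assumed page size */
#define	trunc_pg(x)	((x) & ~(PG_SIZE - 1))
#define	round_pg(x)	(((x) + PG_SIZE - 1) & ~(PG_SIZE - 1))

/* Number of pages actually spanned by the buffer [base, base + len). */
static size_t
pages_spanned(uintptr_t base, size_t len)
{
	return ((round_pg(base + len) - trunc_pg(base)) / PG_SIZE);
}

/* e.g. a 4096-byte buffer that starts 16 bytes into a page spans 2 pages. */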
+ */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); + for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); + addr < endaddr; + addr += PAGE_SIZE, i+=1) { + + vm_page_t m; + + vm_fault_quick( (caddr_t) addr, VM_PROT_READ); + paddr = pmap_kextract(addr); + if (!paddr) { + int j; + for(j=0;j<i;j++) + vm_page_unwire(wpipe->pipe_map.ms[j], 1); + return EFAULT; + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. + */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return 0; +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) +struct pipe *wpipe; +{ + int i; + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i=0;i<wpipe->pipe_map.npages;i++) + vm_page_unwire(wpipe->pipe_map.ms[i], 1); +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) +struct pipe *wpipe; +{ + int size; + int pos; + + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + bcopy((caddr_t) wpipe->pipe_map.kva+pos, + (caddr_t) wpipe->pipe_buffer.buffer, + size); + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + pipe_destroy_write_buffer(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. 
+ */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; +retry: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + error = pipe_build_write_buffer(wpipe, uio); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + pipe_destroy_write_buffer(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + pipe_destroy_write_buffer(wpipe); + } + pipeunlock(wpipe); + return error; + +error1: + wakeup(wpipe); + return error; +} +#endif + +static int +pipe_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + int error = 0; + int orig_resid; + + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + return EPIPE; + } + + /* + * If it is advantageous to resize the pipe buffer, do + * so. 
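The buffer-resize test at the top of pipe_write() (the block just below) reads as a single predicate: promote to a "big" pipe only when the pending request will not fit the default buffer, the global cap on big pipes has not been reached, no direct write is in flight, and the current default-sized buffer is empty so it can be discarded. Restated as a standalone sketch (hypothetical helper; PIPE_SIZE, PIPE_DIRECTW and LIMITBIGPIPES are the constants already used in this file):

/* Illustrative restatement of the resize condition in pipe_write(). */
static int
should_grow_to_big_pipe(size_t resid, int nbigpipes, int state,
    size_t bufsize, size_t bufcnt)
{
	return (resid > PIPE_SIZE &&		/* request overflows default buffer */
	    nbigpipes < LIMITBIGPIPES &&	/* system-wide cap not yet reached */
	    (state & PIPE_DIRECTW) == 0 &&	/* no direct write in progress */
	    bufsize <= PIPE_SIZE &&		/* not already a big pipe */
	    bufcnt == 0);			/* nothing buffered to preserve */
}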
+ */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if (wpipe->pipe_buffer.buffer) { + amountpipekva -= wpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)wpipe->pipe_buffer.buffer, + wpipe->pipe_buffer.size); + } + +#ifndef PIPE_NODIRECT + if (wpipe->pipe_map.kva) { + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + wpipe->pipe_map.kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + + wpipe->pipe_buffer.in = 0; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = 0; + wpipe->pipe_buffer.size = BIG_PIPE_SIZE; + wpipe->pipe_buffer.buffer = NULL; + ++nbigpipe; + +#ifndef PIPE_NODIRECT + wpipe->pipe_map.cnt = 0; + wpipe->pipe_map.kva = 0; + wpipe->pipe_map.pos = 0; + wpipe->pipe_map.npages = 0; +#endif + + } + + + if( wpipe->pipe_buffer.buffer == NULL) { + if ((error = pipelock(wpipe,1)) == 0) { + pipespace(wpipe); + pipeunlock(wpipe); + } else { + return error; + } + } + + ++wpipe->pipe_busy; + orig_resid = uio->uio_resid; + while (uio->uio_resid) { + int space; +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) { + break; + } + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipbww", 0); + if (error) + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + /* XXX perhaps they need to be contiguous to be atomic? */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + /* + * This set the maximum transfer as a segment of + * the buffer. + */ + int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; + /* + * space is the size left in the buffer + */ + if (size > space) + size = space; + /* + * now limit it to the size of the uio transfer + */ + if (size > uio->uio_resid) + size = uio->uio_resid; + if ((error = pipelock(wpipe,1)) == 0) { + /* + * It is possible for a direct write to + * slip in on us... handle it here... + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + size, uio); + pipeunlock(wpipe); + } + if (error) + break; + + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) + wpipe->pipe_buffer.in = 0; + + wpipe->pipe_buffer.cnt += size; + } else { + /* + * If the "read-side" has been blocked, wake it up now. 
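The `space = 0` clamp above is what makes small writes atomic: a request of at most PIPE_BUF bytes either finds room for all of its bytes or sleeps until it does, so records written by concurrent writers never interleave. A userland sketch of code that relies on this guarantee:

#include <limits.h>
#include <unistd.h>

/*
 * One fixed-size record per write(); because sizeof(struct rec) is well
 * under PIPE_BUF, the kernel code above ensures each record is written
 * whole, never interleaved with records from other writers on the pipe.
 */
struct rec {
	int	id;
	char	payload[64];
};

static ssize_t
put_record(int wfd, const struct rec *r)
{
	return (write(wfd, r, sizeof(*r)));
}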
+ */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { + break; + } + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) + error = 0; + + if (error == 0) + getnanotime(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select/poll. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + return error; +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. + */ +int +pipe_ioctl(fp, cmd, data, p) + struct file *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + return (0); + + case FIONREAD: + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &mpipe->pipe_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(mpipe->pipe_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. 
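The ioctl set above deliberately mirrors the socket ioctls, so the usual userland idioms carry over to pipes unchanged; FIONREAD, for instance, reports how many bytes can currently be read (or, during a direct write, how many bytes remain in the mapped source buffer). A minimal userland sketch:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Report how many bytes are waiting to be read on a pipe descriptor. */
static int
pipe_pending(int pfd)
{
	int nbytes;

	if (ioctl(pfd, FIONREAD, &nbytes) == -1)
		return (-1);
	printf("%d bytes buffered\n", nbytes);
	return (nbytes);
}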
*/ + case TIOCGPGRP: + *(int *)data = -fgetown(mpipe->pipe_sigio); + return (0); + + } + return (ENOTTY); +} + +int +pipe_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + register struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + int revents = 0; + + wpipe = rpipe->pipe_peer; + if (events & (POLLIN | POLLRDNORM)) + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || + ((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF) + revents |= events & (POLLOUT | POLLWRNORM); + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) { + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(p, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + } + } + + return (revents); +} + +int +pipe_stat(pipe, ub) + register struct pipe *pipe; + register struct stat *ub; +{ + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_atimespec = pipe->pipe_atime; + ub->st_mtimespec = pipe->pipe_mtime; + ub->st_ctimespec = pipe->pipe_ctime; + /* + * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, + * st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return 0; +} + +/* ARGSUSED */ +static int +pipe_close(fp, p) + struct file *fp; + struct proc *p; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + funsetown(cpipe->pipe_sigio); + pipeclose(cpipe); + fp->f_data = NULL; + return 0; +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + if (cpipe) { + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; + tsleep(cpipe, PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if (ppipe = cpipe->pipe_peer) { + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + ppipe->pipe_peer = NULL; + } + + /* + * free resources + */ + if (cpipe->pipe_buffer.buffer) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + zfree(pipe_zone, cpipe); + } +} diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c new file mode 100644 index 0000000..4756127 --- /dev/null +++ b/sys/kern/sys_process.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: sys_process.c,v 1.40 1998/07/29 18:41:30 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> + +#include <machine/reg.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> +#include <miscfs/procfs/procfs.h> + +/* use the equivalent procfs code */ +#if 0 +static int +pread (struct proc *procp, unsigned int addr, unsigned int *retval) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, + &object, &pindex, &out_prot, &wired); + + if (rv != KERN_SUCCESS) + return EINVAL; + + vm_map_lookup_done (tmap, out_entry); + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); + + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + *retval = 0; + bcopy ((caddr_t)kva + page_offset, + retval, sizeof *retval); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + return rv; +} + +static int +pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + boolean_t fix_prot = 0; + + /* Map page into kernel space */ + + map = 
&procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + /* + * Check the permissions for the area we're interested in. + */ + + if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, + VM_PROT_WRITE) == FALSE) { + /* + * If the page was not writable, we make it so. + * XXX It is possible a page may *not* be read/executable, + * if a process changes that! + */ + fix_prot = 1; + /* The page isn't writable, so let's try making it so... */ + if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_ALL, 0)) != KERN_SUCCESS) + return EFAULT; /* I guess... */ + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, and + * single_use aren't used. One would think the vm code would be + * a *bit* nicer... We use tmap because vm_map_lookup() can + * change the map argument. + */ + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, + &object, &pindex, &out_prot, &wired); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + + /* + * Okay, we've got the page. Let's release tmap. + */ + + vm_map_lookup_done (tmap, out_entry); + + /* + * Fault the page in... + */ + + rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); + if (rv != KERN_SUCCESS) + return EFAULT; + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + if (fix_prot) + vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_READ|VM_PROT_EXECUTE, 0); + return rv; +} +#endif + +/* + * Process debugging system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ptrace_args { + int req; + pid_t pid; + caddr_t addr; + int data; +}; +#endif + +int +ptrace(curp, uap) + struct proc *curp; + struct ptrace_args *uap; +{ + struct proc *p; + struct iovec iov; + struct uio uio; + int error = 0; + int write; + int s; + + if (uap->req == PT_TRACE_ME) + p = curp; + else { + if ((p = pfind(uap->pid)) == NULL) + return ESRCH; + } + + /* + * Permissions check + */ + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. */ + break; + + case PT_ATTACH: + /* Self */ + if (p->p_pid == curp->p_pid) + return EINVAL; + + /* Already traced */ + if (p->p_flag & P_TRACED) + return EBUSY; + + /* not owned by you, has done setuid (unless you're root) */ + if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) || + (p->p_flag & P_SUGID)) { + if (error = suser(curp->p_ucred, &curp->p_acflag)) + return error; + } + + /* can't trace init when securelevel > 0 */ + if (securelevel > 0 && p->p_pid == 1) + return EPERM; + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_READ_U: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_WRITE_U: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: +#ifdef PT_GETREGS + case PT_GETREGS: +#endif +#ifdef PT_SETREGS + case PT_SETREGS: +#endif +#ifdef PT_GETFPREGS + case PT_GETFPREGS: +#endif +#ifdef PT_SETFPREGS + case PT_SETFPREGS: +#endif + /* not being traced... 
*/ + if ((p->p_flag & P_TRACED) == 0) + return EPERM; + + /* not being traced by YOU */ + if (p->p_pptr != curp) + return EBUSY; + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) + return EBUSY; + + /* OK */ + break; + + default: + return EINVAL; + } + +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(p); +#endif + + /* + * Actually do the requests + */ + + write = 0; + curp->p_retval[0] = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + return 0; + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != curp) + proc_reparent(p, curp); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + if ((unsigned)uap->data >= NSIG) + return EINVAL; + + PHOLD(p); + + if (uap->req == PT_STEP) { + if ((error = ptrace_single_step (p))) { + PRELE(p); + return error; + } + } + + if (uap->addr != (caddr_t)1) { + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + if ((error = ptrace_set_pc (p, + (u_long)(uintfptr_t)uap->addr))) { + PRELE(p); + return error; + } + } + PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + proc_reparent(p, pp ? pp : initproc); + } + + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + + } + + sendsig: + /* deliver or queue signal */ + s = splhigh(); + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + setrunnable(p); + } else if (uap->data) { + psignal(p, uap->data); + } + splx(s); + return 0; + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)curp->p_retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */ + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = p; + error = procfs_domem(curp, p, NULL, &uio); + if (uio.uio_resid != 0) { + /* + * XXX procfs_domem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX procfs_domem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? + */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_READ_U: + if ((uintptr_t)uap->addr > UPAGES * PAGE_SIZE - sizeof(int)) { + return EFAULT; + } + if ((uintptr_t)uap->addr & (sizeof(int) - 1)) { + return EFAULT; + } + if (ptrace_read_u_check(p,(vm_offset_t) uap->addr, + sizeof(long)) && + !procfs_kmemaccess(curp)) { + return EFAULT; + } + error = 0; + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + curp->p_retval[0] = *(int *) + ((uintptr_t)p->p_addr + (uintptr_t)uap->addr); + } else { + curp->p_retval[0] = 0; + error = EFAULT; + } + PRELE(p); + return error; + + case PT_WRITE_U: + PHOLD(p); /* user had damn well better be incore! 
*/ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); + } else { + error = EFAULT; + } + PRELE(p); + return error; + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + +#ifdef PT_SETREGS + case PT_SETREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETREGS */ +#ifdef PT_GETREGS + case PT_GETREGS: + /* write = 0 above */ +#endif /* PT_SETREGS */ +#if defined(PT_SETREGS) || defined(PT_GETREGS) + if (!procfs_validregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct reg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct reg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_doregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */ + +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETFPREGS */ +#ifdef PT_GETFPREGS + case PT_GETFPREGS: + /* write = 0 above */ +#endif /* PT_SETFPREGS */ +#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) + if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct fpreg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct fpreg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_dofpregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */ + + default: + break; + } + + return 0; +} + +int +trace_req(p) + struct proc *p; +{ + return 1; +} + +/* + * stopevent() + * Stop a process because of a procfs event; + * stay stopped until p->p_step is cleared + * (cleared by PIOCCONT in procfs). + */ + +void +stopevent(struct proc *p, unsigned int event, unsigned int val) { + p->p_step = 1; + + do { + p->p_xstat = val; + p->p_stype = event; /* Which event caused the stop? */ + wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ + tsleep(&p->p_step, PWAIT, "stopevent", 0); + } while (p->p_step); +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 0000000..8cf30cd --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $Id: sys_socket.c,v 1.18 1998/06/07 17:11:40 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/filedesc.h> + +#include <net/if.h> +#include <net/route.h> + +static int soo_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_close __P((struct file *fp, struct proc *p)); + +struct fileops socketops = + { soo_read, soo_write, soo_ioctl, soo_poll, soo_close }; + +/* ARGSUSED */ +static int +soo_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); +} + +/* ARGSUSED */ +static int +soo_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, + uio->uio_procp); +} + +int +soo_ioctl(fp, cmd, data, p) + struct file *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; + } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &so->so_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(so->so_sigio); + return (0); + + case SIOCSPGRP: + return (fsetown(-(*(int *)data), &so->so_sigio)); + + case SIOCGPGRP: + *(int *)data = -fgetown(so->so_sigio); + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, p)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data, p)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, p)); +} + +int +soo_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + 
struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, p); +} + +int +soo_stat(so, ub) + register struct socket *so; + register struct stat *ub; +{ + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); +} + +/* ARGSUSED */ +static int +soo_close(fp, p) + struct file *fp; + struct proc *p; +{ + int error = 0; + + if (fp->f_data) + error = soclose((struct socket *)fp->f_data); + fp->f_data = 0; + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 0000000..22e9e8e --- /dev/null +++ b/sys/kern/syscalls.c @@ -0,0 +1,347 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. + * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "old.creat", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "obs_execv", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "old.lseek", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "old.stat", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "old.lstat", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ + "ktrace", /* 45 = ktrace */ + "sigaction", /* 46 = sigaction */ + "getgid", /* 47 = getgid */ + "sigprocmask", /* 48 = sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "sigpending", /* 52 = sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "old.mmap", /* 71 = old mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ 
+ "setitimer", /* 83 = setitimer */ + "old.wait", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "old.accept", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ + "sigreturn", /* 103 = sigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ + "sigsuspend", /* 111 = sigsuspend */ + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ + "#119", /* 119 = resuba */ + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ + "rename", /* 128 = rename */ + "old.truncate", /* 129 = old truncate */ + "old.ftruncate", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ + "nfssvc", /* 155 = nfssvc */ + "old.getdirentries", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ + "getfh", /* 161 = getfh */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "#173", /* 173 = nosys */ + "#174", /* 174 = nosys */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + "#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + 
"seteuid", /* 183 = seteuid */ + "#184", /* 184 = lfs_bmapv */ + "#185", /* 185 = lfs_markv */ + "#186", /* 186 = lfs_segclean */ + "#187", /* 187 = lfs_segwait */ + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193", /* 193 = nosys */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 204 = munlock */ + "undelete", /* 205 = undelete */ + "futimes", /* 206 = futimes */ + "getpgid", /* 207 = getpgid */ + "#208", /* 208 = newreboot */ + "poll", /* 209 = poll */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "semconfig", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ + "shmat", /* 228 = shmat */ + "shmctl", /* 229 = shmctl */ + "shmdt", /* 230 = shmdt */ + "shmget", /* 231 = shmget */ + "clock_gettime", /* 232 = clock_gettime */ + "clock_settime", /* 233 = clock_settime */ + "clock_getres", /* 234 = clock_getres */ + "#235", /* 235 = timer_create */ + "#236", /* 236 = timer_delete */ + "#237", /* 237 = timer_settime */ + "#238", /* 238 = timer_gettime */ + "#239", /* 239 = timer_getoverrun */ + "nanosleep", /* 240 = nanosleep */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys */ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ + "openbsd_poll", /* 252 = openbsd_poll */ + "issetugid", /* 253 = issetugid */ + "lchown", /* 254 = lchown */ + "#255", /* 255 = nosys */ + "#256", /* 256 = nosys */ + "#257", /* 257 = nosys */ + "#258", /* 258 = nosys */ + "#259", /* 259 = nosys */ + "#260", /* 260 = nosys */ + "#261", /* 261 = nosys */ + "#262", /* 262 = nosys */ + "#263", /* 263 = nosys */ + "#264", /* 264 = nosys */ + "#265", /* 265 = nosys */ + "#266", /* 266 = nosys */ + "#267", /* 267 = nosys */ + "#268", /* 268 = nosys */ + "#269", /* 269 = nosys */ + "#270", /* 270 = nosys */ + "#271", /* 271 = nosys */ + "getdents", /* 272 = getdents */ + "#273", /* 273 = nosys */ + "lchmod", /* 274 = lchmod */ + "netbsd_lchown", /* 275 = netbsd_lchown */ + "lutimes", /* 276 = lutimes */ + "netbsd_msync", /* 277 = netbsd_msync */ + "nstat", /* 278 = nstat */ + "nfstat", /* 279 = nfstat */ + "nlstat", /* 280 = nlstat */ + "#281", /* 281 = nosys */ + "#282", /* 282 = nosys */ + "#283", /* 283 = nosys */ + "#284", /* 284 = nosys */ + "#285", /* 285 = nosys */ + "#286", /* 286 = nosys */ + "#287", /* 287 = nosys */ + "#288", /* 288 = nosys */ + "#289", /* 289 = nosys */ + "#290", /* 290 = nosys */ + "#291", /* 291 = nosys */ + "#292", /* 292 = nosys */ + "#293", /* 293 = nosys */ + "#294", /* 
294 = nosys */ + "#295", /* 295 = nosys */ + "#296", /* 296 = nosys */ + "#297", /* 297 = nosys */ + "#298", /* 298 = nosys */ + "#299", /* 299 = nosys */ + "modnext", /* 300 = modnext */ + "modstat", /* 301 = modstat */ + "modfnext", /* 302 = modfnext */ + "modfind", /* 303 = modfind */ + "kldload", /* 304 = kldload */ + "kldunload", /* 305 = kldunload */ + "kldfind", /* 306 = kldfind */ + "kldnext", /* 307 = kldnext */ + "kldstat", /* 308 = kldstat */ + "kldfirstmod", /* 309 = kldfirstmod */ + "getsid", /* 310 = getsid */ + "#311", /* 311 = setresuid */ + "#312", /* 312 = setresgid */ + "obs_signanosleep", /* 313 = obsolete signanosleep */ + "aio_return", /* 314 = aio_return */ + "aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ + "yield", /* 321 = yield */ + "thr_sleep", /* 322 = thr_sleep */ + "thr_wakeup", /* 323 = thr_wakeup */ + "mlockall", /* 324 = mlockall */ + "munlockall", /* 325 = munlockall */ + "__getcwd", /* 326 = __getcwd */ + "sched_setparam", /* 327 = sched_setparam */ + "sched_getparam", /* 328 = sched_getparam */ + "sched_setscheduler", /* 329 = sched_setscheduler */ + "sched_getscheduler", /* 330 = sched_getscheduler */ + "sched_yield", /* 331 = sched_yield */ + "sched_get_priority_max", /* 332 = sched_get_priority_max */ + "sched_get_priority_min", /* 333 = sched_get_priority_min */ + "sched_rr_get_interval", /* 334 = sched_rr_get_interval */ + "utrace", /* 335 = utrace */ + "sendfile", /* 336 = sendfile */ + "kldsym", /* 337 = kldsym */ +}; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 0000000..6772363 --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,473 @@ + $Id: syscalls.master,v 1.54 1998/11/05 14:28:24 dg Exp $ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 +; +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. + +; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments +; number system call number, must be in order +; type one of STD, OBSOL, UNIMPL, COMPAT +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + +; types: +; STD always included +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only + +; #ifdef's, etc. may be included, and are copied to the output files. + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. 
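Each line of this table drives several generated files at once (the Makefile rule near the top of sys/kern runs makesyscalls.sh over it): the number and name become a SYS_ constant in ../sys/syscall.h, the name string goes into syscallnames[] in syscalls.c, the pseudo-prototype becomes an args structure in ../sys/sysproto.h, and init_sysent.c gets the dispatch slot. A rough illustration for entry 3 (output heavily abbreviated; exact generated layout omitted):

/* Illustrative expansion of one STD entry:
 *   3  STD  POSIX  { ssize_t read(int fd, void *buf, size_t nbyte); }
 */

#define	SYS_read	3			/* emitted into ../sys/syscall.h */

/* ../sys/sysproto.h (alignment padding members omitted here): */
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};

/* syscalls.c gains the name string "read" at index 3, and init_sysent.c
 * gains the sysent slot that dispatches syscall number 3 to read(). */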
+ +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 STD NOHIDE { void exit(int rval); } exit rexit_args void +2 STD POSIX { int fork(void); } +3 STD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); } +4 STD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. +6 STD POSIX { int close(int fd); } +7 STD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 STD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 STD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ + caddr_t data); } +; XXX 4.4lite2 uses `char *type' but we're not ready for that. +; XXX `path' should have type `const char *' but we're not ready for that. +22 STD BSD { int unmount(char *path, int flags); } +23 STD POSIX { int setuid(uid_t uid); } +24 STD POSIX { uid_t getuid(void); } +25 STD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ + int data); } +27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +30 STD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 STD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 STD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int pipe(void); } +43 STD POSIX { gid_t getegid(void); } +44 STD BSD { int profil(caddr_t samples, size_t size, \ + size_t offset, u_int scale); } +45 STD BSD { int ktrace(char *fname, int ops, int facs, \ + int pid); } +46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \ + struct sigaction *osa); } +47 STD POSIX { gid_t getgid(void); } +48 STD POSIX { int sigprocmask(int how, sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. 
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); } +50 STD BSD { int setlogin(char *namebuf); } +51 STD BSD { int acct(char *path); } +52 STD POSIX { int sigpending(void); } +53 STD BSD { int sigaltstack(struct sigaltstack *nss, \ + struct sigaltstack *oss); } +54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 STD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 STD POSIX { int execve(char *fname, char **argv, char **envv); } +60 STD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 COMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \ + int arg); } getkerninfo getkerninfo_args int +64 COMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(void *addr, size_t len, int flags); } +66 STD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 STD BSD { int sbrk(int incr); } +70 STD BSD { int sstk(int incr); } +71 COMPAT BSD { int mmap(void *addr, int len, int prot, \ + int flags, int fd, long pos); } +72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 STD BSD { int munmap(void *addr, size_t len); } +74 STD BSD { int mprotect(const void *addr, size_t len, int prot); } +75 STD BSD { int madvise(void *addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 STD BSD { int mincore(const void *addr, size_t len, \ + char *vec); } +79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 STD POSIX { int getpgrp(void); } +82 STD POSIX { int setpgid(int pid, int pgid); } +83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \ + struct itimerval *oitv); } +84 COMPAT BSD { int wait(void); } +85 STD BSD { int swapon(char *name); } +86 STD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 COMPAT BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 STD BSD { int getdtablesize(void); } +90 STD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 STD POSIX { int fcntl(int fd, int cmd, long arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. 
+93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \ + fd_set *ex, struct timeval *tv); } +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 STD BSD { int setpriority(int which, int who, int prio); } +97 STD BSD { int socket(int domain, int type, int protocol); } +98 STD BSD { int connect(int s, caddr_t name, int namelen); } +99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 STD BSD { int getpriority(int which, int who); } +101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); } +104 STD BSD { int bind(int s, caddr_t name, int namelen); } +105 STD BSD { int setsockopt(int s, int level, int name, \ + caddr_t val, int valsize); } +106 STD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ + struct sigvec *osv); } +109 COMPAT BSD { int sigblock(int mask); } +110 COMPAT BSD { int sigsetmask(int mask); } +111 STD POSIX { int sigsuspend(sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. +112 COMPAT BSD { int sigstack(struct sigstack *nss, \ + struct sigstack *oss); } +113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 STD BSD { int gettimeofday(struct timeval *tp, \ + struct timezone *tzp); } +117 STD BSD { int getrusage(int who, struct rusage *rusage); } +118 STD BSD { int getsockopt(int s, int level, int name, \ + caddr_t val, int *avalsize); } +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 STD BSD { int writev(int fd, struct iovec *iovp, \ + u_int iovcnt); } +122 STD BSD { int settimeofday(struct timeval *tv, \ + struct timezone *tzp); } +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 STD BSD { int setreuid(int ruid, int euid); } +127 STD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 STD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \ + int flags, caddr_t to, int tolen); } +134 STD BSD { int shutdown(int s, int how); } +135 STD BSD { int socketpair(int domain, int type, int protocol, \ + int *rsv); } +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 STD BSD { int adjtime(struct timeval *delta, \ + struct timeval *olddelta); } +141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 COMPAT BSD { long gethostid(void); } +143 COMPAT BSD { int sethostid(long hostid); } +144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); } +145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); } +146 COMPAT BSD { int killpg(int pgid, int signum); } +147 STD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char 
*path, int cmd, int uid, \ + caddr_t arg); } +149 COMPAT BSD { int quota(void); } +150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system calls. (This includes various calls added for compatibity +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys +; 155 is initialized by the NFS code, if present. +155 NOIMPL BSD { int nfssvc(int flag, caddr_t argp); } +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +; 161 is initialized by the NFS code, if present. +161 NOIMPL BSD { int getfh(char *fname, struct fhandle *fhp); } +162 STD BSD { int getdomainname(char *domainname, int len); } +163 STD BSD { int setdomainname(char *domainname, int len); } +164 STD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 STD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +169 STD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; XXX should be { int semsys(int which, ...); } +170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; XXX should be { int msgsys(int which, ...); } +171 STD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 UNIMPL NOHIDE nosys +174 UNIMPL NOHIDE nosys +175 UNIMPL NOHIDE nosys +176 STD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys + +; Syscalls 180-199 are used by/reserved for BSD +181 STD POSIX { int setgid(gid_t gid); } +182 STD BSD { int setegid(gid_t egid); } +183 STD BSD { int seteuid(uid_t euid); } +184 UNIMPL BSD lfs_bmapv +185 UNIMPL BSD lfs_markv +186 UNIMPL BSD lfs_segclean +187 UNIMPL BSD lfs_segwait +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 STD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 STD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 STD BSD { int getrlimit(u_int which, \ + struct orlimit *rlp); } \ + getrlimit __getrlimit_args int +195 STD BSD { int setrlimit(u_int which, \ + struct orlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ + int flags, int fd, long pad, off_t pos); } +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ + int whence); } +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; 
here allows to avoid one in libc/sys/Makefile.inc. +203 STD BSD { int mlock(const void *addr, size_t len); } +204 STD BSD { int munlock(const void *addr, size_t len); } +205 STD BSD { int undelete(char *path); } +206 STD BSD { int futimes(int fd, struct timeval *tptr); } +207 STD BSD { int getpgid(pid_t pid); } +208 UNIMPL NOHIDE newreboot (NetBSD) +209 STD BSD { int poll(struct pollfd *fds, u_int nfds, \ + int timeout); } + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; +220 STD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 STD BSD { int semget(key_t key, int nsems, int semflg); } +222 STD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 STD BSD { int semconfig(int flag); } +224 STD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 STD BSD { int msgget(key_t key, int msgflg); } +226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 STD BSD { int shmat(int shmid, void *shmaddr, int shmflg); } +229 STD BSD { int shmctl(int shmid, int cmd, \ + struct shmid_ds *buf); } +230 STD BSD { int shmdt(void *shmaddr); } +231 STD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 STD POSIX { int clock_gettime(clockid_t clock_id, \ + struct timespec *tp); } +233 STD POSIX { int clock_settime(clockid_t clock_id, \ + const struct timespec *tp); } +234 STD POSIX { int clock_getres(clockid_t clock_id, \ + struct timespec *tp); } +235 UNIMPL NOHIDE timer_create +236 UNIMPL NOHIDE timer_delete +237 UNIMPL NOHIDE timer_settime +238 UNIMPL NOHIDE timer_gettime +239 UNIMPL NOHIDE timer_getoverrun +240 STD POSIX { int nanosleep(const struct timespec *rqtp, \ + struct timespec *rmtp); } +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 STD BSD { int minherit(void *addr, size_t len, int inherit); } +251 STD BSD { int rfork(int flags); } +252 STD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \ + int timeout); } +253 STD BSD { int issetugid(void); } +254 STD BSD { int lchown(char *path, int uid, int gid); } +255 UNIMPL NOHIDE nosys +256 UNIMPL NOHIDE nosys +257 UNIMPL NOHIDE nosys +258 UNIMPL NOHIDE nosys +259 UNIMPL NOHIDE nosys +260 UNIMPL NOHIDE nosys +261 UNIMPL NOHIDE nosys +262 UNIMPL NOHIDE nosys +263 UNIMPL NOHIDE nosys +264 UNIMPL NOHIDE nosys +265 UNIMPL NOHIDE nosys +266 UNIMPL NOHIDE nosys +267 UNIMPL NOHIDE nosys +268 UNIMPL NOHIDE nosys +269 UNIMPL NOHIDE nosys +270 UNIMPL NOHIDE nosys +271 UNIMPL NOHIDE nosys +272 STD BSD { int getdents(int fd, char *buf, size_t count); } +273 UNIMPL NOHIDE nosys +274 STD BSD { int lchmod(char *path, mode_t mode); } +275 NOPROTO BSD { int lchown(char 
*path, uid_t uid, gid_t gid); } netbsd_lchown netbsd_lchown int +276 STD BSD { int lutimes(char *path, struct timeval *tptr); } +277 NOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync netbsd_msync int +278 STD BSD { int nstat(char *path, struct nstat *ub); } +279 STD BSD { int nfstat(int fd, struct nstat *sb); } +280 STD BSD { int nlstat(char *path, struct nstat *ub); } +281 UNIMPL NOHIDE nosys +282 UNIMPL NOHIDE nosys +283 UNIMPL NOHIDE nosys +284 UNIMPL NOHIDE nosys +285 UNIMPL NOHIDE nosys +286 UNIMPL NOHIDE nosys +287 UNIMPL NOHIDE nosys +288 UNIMPL NOHIDE nosys +289 UNIMPL NOHIDE nosys +290 UNIMPL NOHIDE nosys +291 UNIMPL NOHIDE nosys +292 UNIMPL NOHIDE nosys +293 UNIMPL NOHIDE nosys +294 UNIMPL NOHIDE nosys +295 UNIMPL NOHIDE nosys +296 UNIMPL NOHIDE nosys +297 UNIMPL NOHIDE nosys +298 UNIMPL NOHIDE nosys +299 UNIMPL NOHIDE nosys +; syscall numbers for FreeBSD +300 STD BSD { int modnext(int modid); } +301 STD BSD { int modstat(int modid, struct module_stat* stat); } +302 STD BSD { int modfnext(int modid); } +303 STD BSD { int modfind(char *name); } +304 STD BSD { int kldload(const char *file); } +305 STD BSD { int kldunload(int fileid); } +306 STD BSD { int kldfind(const char *file); } +307 STD BSD { int kldnext(int fileid); } +308 STD BSD { int kldstat(int fileid, struct kld_file_stat* stat); } +309 STD BSD { int kldfirstmod(int fileid); } +310 STD BSD { int getsid(pid_t pid); } +311 UNIMPL NOHIDE setresuid +312 UNIMPL NOHIDE setresgid +313 OBSOL NOHIDE signanosleep +314 STD BSD { int aio_return(struct aiocb *aiocbp); } +315 STD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } +316 STD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); } +317 STD BSD { int aio_error(struct aiocb *aiocbp); } +318 STD BSD { int aio_read(struct aiocb *aiocbp); } +319 STD BSD { int aio_write(struct aiocb *aiocbp); } +320 STD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } +321 STD BSD { int yield(void); } +322 STD BSD { int thr_sleep(const struct timespec *timeout); } +323 STD BSD { int thr_wakeup(pid_t pid); } +324 STD BSD { int mlockall(int how); } +325 STD BSD { int munlockall(void); } +326 STD BSD { int __getcwd(u_char *buf, u_int buflen); } + +327 STD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); } +328 STD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); } + +329 STD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } +330 STD POSIX { int sched_getscheduler (pid_t pid); } + +331 STD POSIX { int sched_yield (void); } +332 STD POSIX { int sched_get_priority_max (int policy); } +333 STD POSIX { int sched_get_priority_min (int policy); } +334 STD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); } +335 STD BSD { int utrace(caddr_t addr, size_t len); } +336 STD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ + struct sf_hdtr *hdtr, off_t *sbytes, int flags); } +337 STD BSD { int kldsym(int fileid, int cmd, void *data); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 0000000..553c213 --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,283 @@ +/* $Id: sysv_ipc.c,v 1.7 1997/11/06 19:29:22 phk Exp $ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/ipc.h> +#include <sys/ucred.h> + +#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) + +/* + * Check for ipc permission + */ + +int +ipcperm(cred, perm, mode) + struct ucred *cred; + struct ipc_perm *perm; + int mode; +{ + + if (cred->cr_uid == 0) + return (0); + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode ? 
0 : EACCES); +} + +#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */ + + +#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) + +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> +#include <sys/systm.h> + +static void sysv_nosys __P((struct proc *p, char *s)); + +static void +sysv_nosys(p, s) + struct proc *p; + char *s; +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); +} + +#if !defined(SYSVSEM) + +/* + * SYSVSEM stubs + */ + +int +semsys(p, uap) + struct proc *p; + struct semsys_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semconfig(p, uap) + struct proc *p; + struct semconfig_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +__semctl(p, uap) + struct proc *p; + register struct __semctl_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semget(p, uap) + struct proc *p; + register struct semget_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semop(p, uap) + struct proc *p; + register struct semop_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +/* called from kern_exit.c */ +void +semexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSEM) */ + + +#if !defined(SYSVMSG) + +/* + * SYSVMSG stubs + */ + +int +msgsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct msgsys_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgctl(p, uap) + struct proc *p; + register struct msgctl_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgget(p, uap) + struct proc *p; + register struct msgget_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgsnd(p, uap) + struct proc *p; + register struct msgsnd_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgrcv(p, uap) + struct proc *p; + register struct msgrcv_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +#endif /* !defined(SYSVMSG) */ + + +#if !defined(SYSVSHM) + +/* + * SYSVSHM stubs + */ + +int +shmdt(p, uap) + struct proc *p; + struct shmdt_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmat(p, uap) + struct proc *p; + struct shmat_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmctl(p, uap) + struct proc *p; + struct shmctl_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmget(p, uap) + struct proc *p; + struct shmget_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmsys(p, uap) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSHM) */ + +#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */ diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..d3b8a98 --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1027 @@ +/* $Id: sysv_msg.c,v 1.17 1997/11/06 19:29:24 phk Exp $ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/msg.h> +#include <sys/sysent.h> + +static void msginit __P((void *)); +SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args; +int msgctl __P((struct proc *p, struct msgctl_args *uap)); +struct msgget_args; +int msgget __P((struct proc *p, struct msgget_args *uap)); +struct msgsnd_args; +int msgsnd __P((struct proc *p, struct msgsnd_args *uap)); +struct msgrcv_args; +int msgrcv __P((struct proc *p, struct msgrcv_args *uap)); +#endif +static void msg_freehdr __P((struct msg *msghdr)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs; /* list of free msg headers */ +char *msgpool; /* MSGMAX byte long msg buffer pool */ +struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +struct msg *msghdrs; /* MSGTQL msg headers */ +struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +void +msginit(dummy) + void *dummy; +{ + register int i; + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... 
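For context on why the segment size matters: msgsnd() below carves each message body into msgssz-byte chunks taken from msgpool and chained through msgmaps[].next, so the pool holds msgssz * msgseg bytes in total. A worked example, assuming the traditional defaults MSGSSZ = 8 and MSGSEG = 2048 from <sys/msg.h> (those values are an assumption, they are not shown in this diff):

	/* Segment bookkeeping for a 20-byte message, assuming msgssz = 8. */
	int	msgssz = 8;	/* bytes per pool segment */
	size_t	msgsz = 20;	/* message body length    */
	int	segs_needed = (msgsz + msgssz - 1) / msgssz;	/* = 3, same formula as msgsnd() */
	/* Total pool: msgssz * msgseg = 8 * 2048 = 16384 bytes (MSGMAX). */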
+ */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + } +} + +/* + * Entry point for all MSG calls + */ +int +msgsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; +{ + + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + return ((*msgcalls[uap->which])(p, &uap->a2)); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +int +msgctl(p, uap) + struct proc *p; + register struct msgctl_args *uap; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + struct ucred *cred = p->p_ucred; + int rval, eval; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + eval = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is 
screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + return(eval); + if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0) + return(EPERM); + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + return(EINVAL); /* non-standard errno! */ + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time_second; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + eval = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + return(EINVAL); + } + + if (eval == 0) + p->p_retval[0] = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +int +msgget(p, uap) + struct proc *p; + register struct msgget_args *uap; +{ + int msqid, eval; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + return(EEXIST); + } + if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + return(eval); + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
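The "wrong sequence number" rejections above, and the msg_perm.seq bump a little further down, both rely on how SysV IPC identifiers are encoded: the value returned to userland packs the array index together with a per-slot generation count, so an identifier that survived an IPC_RMID no longer matches the recycled slot. A sketch of the conventional macro shapes from <sys/ipc.h>; the exact bit layout is an assumption here, not something this diff shows:

/* Conventional (illustrative) shape of the id <-> slot macros used here. */
#define IPCID_TO_IX(id)		((id) & 0xffff)		/* index into msqids[]     */
#define IPCID_TO_SEQ(id)	(((id) >> 16) & 0xffff)	/* generation of that slot */
#define IXSEQ_TO_IPCID(ix, perm) (((perm).seq << 16) | ((ix) & 0xffff))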
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time_second; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + /* Construct the unique msqid */ + p->p_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +int +msgsnd(p, uap) + struct proc *p; + register struct msgsnd_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, eval; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + return(eval); + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + return(EINVAL); + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller 
doesn't want to wait\n"); +#endif + return(EAGAIN); + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request */ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, eval=%d\n", eval); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code + yet! */ + return(EINVAL); +#endif + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); +#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((eval = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + return(EINVAL); + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d 
copying in message segment\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = p->p_pid; + msqptr->msg_stime = time_second; + + wakeup((caddr_t)msqptr); + p->p_retval[0] = 0; + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +int +msgrcv(p, uap) + struct proc *p; + register struct msgrcv_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int eval; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? + * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! 
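Seen from userland, the selection rules this loop implements are: msgtyp == 0 takes the first message queued, a positive msgtyp demands an exact type match, and a negative msgtyp accepts the first message whose type is less than or equal to |msgtyp|. The caller's buffer always begins with the long type word that the kernel copies separately from the body. A minimal usage sketch (error handling omitted; the struct and function names are invented for illustration, they are not part of this code):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <string.h>

struct mymsg {
	long	mtype;		/* must be >= 1 on send; copied out first on receive */
	char	mtext[64];	/* body, carved into msgssz segments by the kernel   */
};

static int
mymsg_roundtrip(void)
{
	struct mymsg m;
	int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	m.mtype = 2;
	strcpy(m.mtext, "hello");
	msgsnd(qid, &m, sizeof(m.mtext), 0);

	/* Negative msgtyp: accept any message whose type is <= 3. */
	msgrcv(qid, &m, sizeof(m.mtext), -3, MSG_NOERROR);
	return (0);
}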
+ */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? + */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ +#ifdef ENOMSG + return(ENOMSG); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EAGAIN); +#endif + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (eval=%d)\n", eval); +#endif + + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = p->p_pid; + msqptr->msg_rtime = time_second; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + p->p_retval[0] = msgsz; + return(0); +} diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..fb04c42 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,977 @@ +/* $Id: sysv_sem.c,v 1.21 1998/03/30 09:50:41 phk Exp $ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/sysent.h> + +static void seminit __P((void *)); +SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL) + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl __P((struct proc *p, struct __semctl_args *uap)); +struct semget_args; +int semget __P((struct proc *p, struct semget_args *uap)); +struct semop_args; +int semop __P((struct proc *p, struct semop_args *uap)); +struct semconfig_args; +int semconfig __P((struct proc *p, struct semconfig_args *uap)); +#endif + +static struct sem_undo *semu_alloc __P((struct proc *p)); +static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr, + int semid, int semnum, int adjval)); +static void semundo_clear __P((int semid, int semnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop, (sy_call_t *)semconfig +}; + +static int semtot = 0; +struct semid_ds *sema; /* semaphore id pool */ +struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +int *semu; /* undo structure pool */ + +static struct proc *semlock_holder = NULL; + +void +seminit(dummy) + void *dummy; +{ + register int i; + + if (sema == NULL) + panic("sema is NULL"); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; +} + +/* + * Entry point for all SEM calls + */ +int +semsys(p, uap) + struct proc *p; + /* XXX actually varargs. 
*/ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; +{ + + while (semlock_holder != NULL && semlock_holder != p) + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0); + + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + return ((*semcalls[uap->which])(p, &uap->a2)); +} + +/* + * Lock or unlock the entire semaphore facility. + * + * This will probably eventually evolve into a general purpose semaphore + * facility status enquiry mechanism (I don't like the "read /dev/kmem" + * approach currently taken by ipcs and the amount of info that we want + * to be able to extract for ipcs is probably beyond what the capability + * of the getkerninfo facility. + * + * At the time that the current version of semconfig was written, ipcs is + * the only user of the semconfig facility. It uses it to ensure that the + * semaphore facility data structures remain static while it fishes around + * in /dev/kmem. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct semconfig_args { + semconfig_ctl_t flag; +}; +#endif + +int +semconfig(p, uap) + struct proc *p; + struct semconfig_args *uap; +{ + int eval = 0; + + switch (uap->flag) { + case SEM_CONFIG_FREEZE: + semlock_holder = p; + break; + + case SEM_CONFIG_THAW: + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + break; + + default: + printf("semconfig: unknown flag parameter value (%d) - ignored\n", + uap->flag); + eval = EINVAL; + break; + } + + p->p_retval[0] = 0; + return(eval); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(p) + struct proc *p; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. + * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = p; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! 
+ */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(p, supptr, semid, semnum, adjval) + register struct proc *p; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(p); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). + */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval == 0) + sunptr->un_adjval = 0; + else + sunptr->un_adjval += adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (suptr->un_cnt != seminfo.semume) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +int +__semctl(p, uap) + struct proc *p; + register struct __semctl_args *uap; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = p->p_ucred; + int i, rval, eval; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + eval = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + 
wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) + return(eval); + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time_second; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + eval = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + break; + + case GETZCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyin(&real_arg.array[i], + (caddr_t)&semaptr->sem_base[i].semval, + sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + return(EINVAL); + } + + if (eval == 0) + p->p_retval[0] = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +int +semget(p, uap) + struct proc *p; + register struct semget_args *uap; +{ + int semid, eval; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = p->p_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg); +#endif + + if (key != IPC_PRIVATE) { + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((eval = ipcperm(cred, 
&sema[semid].sem_perm, + semflg & 0700))) + return(eval); + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + return(EINVAL); + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + return(EEXIST); + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + return(EINVAL); + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + return(ENOSPC); + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time_second; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + p->p_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + int nsops; +}; +#endif + +int +semop(p, uap) + struct proc *p; + register struct semop_args *uap; +{ + int semid = uap->semid; + int nsops = uap->nsops; + struct sembuf sops[MAX_SOPS]; + register struct semid_ds *semaptr; + register struct sembuf *sopptr; + register struct sem *semptr; + struct sem_undo *suptr = NULL; + struct ucred *cred = p->p_ucred; + int i, j, eval; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); +#endif + + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + return(EINVAL); + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { +#ifdef SEM_DEBUG + printf("eval = %d from ipaccess\n", eval); +#endif + return(eval); + } + + if (nsops > MAX_SOPS) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); +#endif + return(E2BIG); + } + + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %d)\n", eval, + uap->sops, &sops, nsops * sizeof(sops[0])); +#endif + return(eval); + } + + /* + * Loop trying to satisfy the vector of requests. 
+ * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. + * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + do_undos = 0; + + for (;;) { + do_wakeup = 0; + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + + if (sopptr->sem_num >= semaptr->sem_nsems) + return(EFBIG); + + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } else if (sopptr->sem_op == 0) { + if (semptr->semval > 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. + */ + if (sopptr->sem_flg & IPC_NOWAIT) + return(EAGAIN); + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (eval=%d)!\n", eval); +#endif + + suptr = NULL; /* sem_undo may have been reallocated */ + + if (eval != 0) + return(EINTR); +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + /* The man page says to return EIDRM. */ + /* Unfortunately, BSD doesn't define that code! */ +#ifdef EIDRM + return(EIDRM); +#else + return(EINVAL); +#endif + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + eval = semundo_adjust(p, &suptr, semid, + sops[i].sem_num, -adjval); + if (eval == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. 
This guarantees that we won't run + * out of space as we roll things back out. + */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(p, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("eval = %d from semundo_adjust\n", eval); +#endif + return(eval); + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = p->p_pid; + } + + /* Do a wakeup if any semaphore was up'd. */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif + printf("semop: back from wakeup\n"); +#else + wakeup((caddr_t)semaptr); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + p->p_retval[0] = 0; + return(0); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +void +semexit(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int did_something; + + /* + * If somebody else is holding the global semaphore facility lock + * then sleep until it is released. + */ + while (semlock_holder != NULL && semlock_holder != p) { +#ifdef SEM_DEBUG + printf("semaphore facility locked - sleeping ...\n"); +#endif + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0); + } + + did_something = 0; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. + */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + goto unlock; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. + */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; + +unlock: + /* + * If the exiting process is holding the global semaphore facility + * lock then release it. 
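+ * The wakeup lets any process sleeping on &semlock_holder (for
+ * instance in the loop at the top of this function) continue.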
+ */ + if (semlock_holder == p) { + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + } +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..a6c2dfe --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,617 @@ +/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_compat.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/sysent.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_inherit.h> + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args; +extern int shmat __P((struct proc *p, struct shmat_args *uap)); +struct shmctl_args; +extern int shmctl __P((struct proc *p, struct shmctl_args *uap)); +struct shmdt_args; +extern int shmdt __P((struct proc *p, struct shmdt_args *uap)); +struct shmget_args; +extern int shmget __P((struct proc *p, struct shmget_args *uap)); +#endif + +static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); + +static void shminit __P((void *)); +SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) + +struct oshmctl_args; +static int oshmctl __P((struct proc *p, struct oshmctl_args *uap)); +static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode)); +static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. 
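+ * These entries back the single shmsys(2) entry point: shmsys()
+ * at the bottom of this file uses uap->which as an index into
+ * this table to pick the real handler.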
*/ +static sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed; +struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment __P((struct shmid_ds *)); +static int shm_find_segment_by_key __P((key_t)); +static struct shmid_ds *shm_find_segment_by_shmid __P((int)); +static int shm_delete_mapping __P((struct proc *, struct shmmap_state *)); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shminfo.shmmni; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shminfo.shmmni) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + segnum = IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time_second; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +int +shmdt(p, uap) + struct proc *p; + struct shmdt_args *uap; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) + return EINVAL; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1 && + shmmap_s->va == (vm_offset_t)uap->shmaddr) + break; + if (i == shminfo.shmseg) + return EINVAL; + return shm_delete_mapping(p, shmmap_s); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +int +shmat(p, uap) + struct proc *p; + struct shmat_args *uap; +{ + int error, i, flags; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < 
shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + error = ipcperm(cred, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); + if (error) + return error; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) + return EMFILE; + size = round_page(shmseg->shm_segsz); + prot = VM_PROT_READ; + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) + attach_va = (vm_offset_t)uap->shmaddr; + else + return EINVAL; + } else { + /* This is just a hint to vm_map_find() about where to put it. */ + attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + return ENOMEM; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time_second; + shmseg->shm_nattch++; + p->p_retval[0] = attach_va; + return 0; +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +static int +oshmctl(p, uap) + struct proc *p; + struct oshmctl_args *uap; +{ +#ifdef COMPAT_43 + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + return error; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. 
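+ * Commands other than IPC_STAT are passed straight through to the
+ * new-style shmctl() below; the old and new argument structures
+ * start with the same shmid and cmd fields.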
*/ + return ((sy_call_t *)shmctl)(p, uap); + } + return 0; +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +int +shmctl(p, uap) + struct proc *p; + struct shmctl_args *uap; +{ + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + return error; + break; + case IPC_SET: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + return error; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time_second; + break; + case IPC_RMID: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + return EINVAL; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(p, uap, mode, segnum) + struct proc *p; + struct shmget_args *uap; + int mode; + int segnum; +{ + struct shmid_ds *shmseg; + struct ucred *cred = p->p_ucred; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + error = ipcperm(cred, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + p->p_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(p, uap, mode) + struct proc *p; + struct shmget_args *uap; + int mode; +{ + int i, segnum, shmid, size; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + for (i = 0; i < shminfo.shmmni; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shminfo.shmmni) + panic("shmseg free count inconsistent"); + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. 
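+ * A concurrent shmget() on the same key will find this segment,
+ * see SHMSEG_REMOVED in shmget_existing(), and sleep until the
+ * SHMSEG_WANTED wakeup at the end of this function.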
+ */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. + */ + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); + vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING); + vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT); + + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = p->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time_second; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + p->p_retval[0] = shmid; + return 0; +} + +int +shmget(p, uap) + struct proc *p; + struct shmget_args *uap; +{ + int segnum, mode, error; + + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(p, uap, mode, segnum); + if (error == EAGAIN) + goto again; + return error; + } + if ((uap->shmflg & IPC_CREAT) == 0) + return ENOENT; + } + return shmget_allocate_segment(p, uap, mode); +} + +int +shmsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; +{ + + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return EINVAL; + return ((*shmcalls[uap->which])(p, &uap->a2)); +} + +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +void +shmexit(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +void +shminit(dummy) + void *dummy; +{ + int i; + for (i = 0; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; +} diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 0000000..1adf784 --- /dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,2437 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $Id: tty.c,v 1.110 1998/12/08 10:22:07 bde Exp $ + */ + +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. + * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. + * o Now that historical speed conversions are handled here, don't + * do them in drivers. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). 
+ */ + +#include "snp.h" +#include "opt_compat.h" +#include "opt_uconsole.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#define TTYDEFCHARS +#include <sys/tty.h> +#undef TTYDEFCHARS +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/dkstat.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#if NSNP > 0 +#include <sys/snoop.h> +#endif + +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); + +static int proc_compare __P((struct proc *p1, struct proc *p2)); +static int ttnread __P((struct tty *tp)); +static void ttyecho __P((int c, struct tty *tp)); +static int ttyoutput __P((int c, register struct tty *tp)); +static void ttypend __P((struct tty *tp)); +static void ttyretype __P((struct tty *tp)); +static void ttyrub __P((int c, struct tty *tp)); +static void ttyrubo __P((struct tty *tp, int cnt)); +static void ttyunblock __P((struct tty *tp)); +static int ttywflush __P((struct tty *tp)); + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. */ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +static u_char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. 
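+ * (Everything from 0x80 through 0xff therefore ends up as
+ * ORDINARY|ALPHA below.)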
+ */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(device, tp) + dev_t device; + register struct tty *tp; +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + ttsetwater(tp); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. + */ +int +ttyclose(tp) + register struct tty *tp; +{ + int s; + + funsetown(tp->t_sigio); + s = spltty(); + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpdown((struct snoop *)tp->t_sc); +#endif + + tp->t_gen++; + tp->t_line = TTYDISC; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + splx(s); + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) + +/* + * Process input of a single character received on a tty. + */ +int +ttyinput(c, tp) + register int c; + register struct tty *tp; +{ + register tcflag_t iflag, lflag; + register cc_t *cc; + int i, err; + + /* + * If input is pending take it first. + */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + + /* Handle exceptional conditions (break, parity, framing). 
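+ * The driver reports these conditions in the high bits of `c'
+ * (TTY_BI, TTY_PE, TTY_FE); they are stripped from the character
+ * below before it is queued.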
*/ + cc = tp->t_cc; + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_BI)) { + if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + pgsignal(tp->t_pgrp, SIGINT, 1); + goto endcase; + } + if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + return (0); + else if (ISSET(iflag, PARMRK)) { +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. + */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + goto endcase; + } + } + /* + * Handle start/stop characters. + */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, + 0); +#endif + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + return (0); + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase (^H / ^?) 
+ */ + if (CCEQ(cc[VERASE], c)) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { + if (ISSET(lflag, ISIG)) + pgsignal(tp->t_pgrp, SIGINFO, 1); + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_ohiwat) + (void)ttyoutput(CTRL('g'), tp); + } + goto endcase; + } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. + */ + if (putc(c, &tp->t_rawq) >= 0) { + if (!ISSET(lflag, ICANON)) { + ttwakeup(tp); + ttyecho(c, tp); + goto endcase; + } + if (TTBREAKC(c, lflag)) { + tp->t_rocount = 0; + catq(&tp->t_rawq, &tp->t_canq); + ttwakeup(tp); + } else if (tp->t_rocount++ == 0) + tp->t_rocol = tp->t_column; + if (ISSET(tp->t_state, TS_ERASE)) { + /* + * end of prterase \.../ + */ + CLR(tp->t_state, TS_ERASE); + (void)ttyoutput('/', tp); + } + i = tp->t_column; + ttyecho(c, tp); + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { + /* + * Place the cursor over the '^' of the ^D. + */ + i = imin(2, tp->t_column - i); + while (i > 0) { + (void)ttyoutput('\b', tp); + i--; + } + } + } +endcase: + /* + * IXANY means allow any character to restart output. + */ + if (ISSET(tp->t_state, TS_TTSTOP) && + !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) + return (0); +restartoutput: + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); +startoutput: + return (ttstart(tp)); +} + +/* + * Output a single character on a tty, doing output processing + * as needed (expanding tabs, newline processing, etc.). + * Returns < 0 if succeeds, otherwise returns char to resend. + * Must be recursive. 
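+ * A caller that gets the character back (ttwrite() below, for
+ * example) typically starts output, waits for space, and then
+ * retries the same character.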
+ */ +static int +ttyoutput(c, tp) + register int c; + register struct tty *tp; +{ + register tcflag_t oflag; + register int col, s; + + oflag = tp->t_oflag; + if (!ISSET(oflag, OPOST)) { + if (ISSET(tp->t_lflag, FLUSHO)) + return (-1); + if (putc(c, &tp->t_outq)) + return (c); + tk_nout++; + tp->t_outcc++; + return (-1); + } + /* + * Do tab expansion if OXTABS is set. Special case if we external + * processing, we don't do the tab expansion because we'll probably + * get it wrong. If tab expansion needs to be done, let it happen + * externally. + */ + CLR(c, ~TTY_CHARMASK); + if (c == '\t' && + ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { + c = 8 - (tp->t_column & 7); + if (!ISSET(tp->t_lflag, FLUSHO)) { + s = spltty(); /* Don't interrupt tabs. */ + c -= b_to_q(" ", c, &tp->t_outq); + tk_nout += c; + tp->t_outcc += c; + splx(s); + } + tp->t_column += c; + return (c ? -1 : '\t'); + } + if (c == CEOT && ISSET(oflag, ONOEOT)) + return (-1); + + /* + * Newline translation: if ONLCR is set, + * translate newline into "\r\n". + */ + if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { + tk_nout++; + tp->t_outcc++; + if (putc('\r', &tp->t_outq)) + return (c); + } + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) + return (c); + + col = tp->t_column; + switch (CCLASS(c)) { + case BACKSPACE: + if (col > 0) + --col; + break; + case CONTROL: + break; + case NEWLINE: + case RETURN: + col = 0; + break; + case ORDINARY: + ++col; + break; + case TAB: + col = (col + 8) & ~7; + break; + } + tp->t_column = col; + return (-1); +} + +/* + * Ioctls for all tty devices. Called after line-discipline specific ioctl + * has been called to do discipline-specific functions and/or reject any + * of these ioctl commands. + */ +/* ARGSUSED */ +int +ttioctl(tp, cmd, data, flag) + register struct tty *tp; + u_long cmd; + int flag; + void *data; +{ + register struct proc *p; + int s, error; + + p = curproc; /* XXX */ + + /* If the ioctl involves modification, hang if in the background. */ + switch (cmd) { + case TIOCCBRK: + case TIOCCONS: + case TIOCDRAIN: + case TIOCEXCL: + case TIOCFLUSH: +#ifdef TIOCHPCL + case TIOCHPCL: +#endif + case TIOCNXCL: + case TIOCSBRK: + case TIOCSCTTY: + case TIOCSDRAINWAIT: + case TIOCSETA: + case TIOCSETAF: + case TIOCSETAW: + case TIOCSETD: + case TIOCSPGRP: + case TIOCSTART: + case TIOCSTAT: + case TIOCSTI: + case TIOCSTOP: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + while (isbackground(p, tp) && + (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTOU, 1); + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) + return (error); + } + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. 
*/ + case FIONREAD: /* get # bytes to read */ + s = spltty(); + *(int *)data = ttnread(tp); + splx(s); + break; + + case FIOSETOWN: + /* + * Policy -- Don't allow FIOSETOWN on someone else's + * controlling tty + */ + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + + error = fsetown(*(int *)data, &tp->t_sigio); + if (error) + return (error); + break; + case FIOGETOWN: + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + *(int *)data = fgetown(tp->t_sigio); + break; + + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + register int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + if (constty && constty != tp && + ISSET(constty->t_state, TS_CONNECTED)) + return (EBUSY); +#ifndef UCONSOLE + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + error = ttywait(tp); + if (error) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + register struct termios *t = (struct termios *)data; + + if (t->c_ispeed == 0) + t->c_ispeed = t->c_ospeed; + if (t->c_ispeed == 0) + t->c_ispeed = tp->t_ospeed; + if (t->c_ispeed == 0) + return (EINVAL); + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + error = ttywait(tp); + if (error) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. 
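+ * Turning CLOCAL on here therefore revives a hung-up (TS_ZOMBIE)
+ * tty and wakes anyone waiting for carrier.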
+ */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + if (t->c_ospeed != 0) + tp->t_ospeed = t->c_ospeed; + ttsetwater(tp); + } + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { + catq(&tp->t_rawq, &tp->t_canq); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. + */ + catq(&tp->t_canq, &tp->t_rawq); + } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. + */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + register int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if (p->p_ucred->cr_uid && (flag & FREAD) == 0) + return (EPERM); + if (p->p_ucred->cr_uid && !isctty(p, tp)) + return (EACCES); + s = spltty(); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); +#endif + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. 
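+ * Only a session leader may acquire a controlling tty here, and
+ * neither the session nor the tty may already be attached
+ * elsewhere.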
*/ + if (!SESS_LEADER(p) || + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) + return (EPERM); + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + p->p_session->s_ttyp = tp; + p->p_flag |= P_CONTROLT; + break; + case TIOCSPGRP: { /* set pgrp of tty */ + register struct pgrp *pgrp = pgfind(*(int *)data); + + if (!isctty(p, tp)) + return (ENOTTY); + else if (pgrp == NULL || pgrp->pg_session != p->p_session) + return (EPERM); + tp->t_pgrp = pgrp; + break; + } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + pgsignal(tp->t_pgrp, SIGWINCH, 1); + } + break; + case TIOCSDRAINWAIT: + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (ENOIOCTL); +#endif + } + return (0); +} + +int +ttypoll(tp, events, p) + struct tty *tp; + int events; + struct proc *p; +{ + int s; + int revents = 0; + + if (tp == NULL) /* XXX used to return ENXIO, but that means true! */ + return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) + | POLLHUP); + + s = spltty(); + if (events & (POLLIN | POLLRDNORM)) + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &tp->t_rsel); + + if (events & (POLLOUT | POLLWRNORM)) + if ((tp->t_outq.c_cc <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLOUT | POLLWRNORM); + else + selrecord(p, &tp->t_wsel); + splx(s); + return (revents); +} + +/* + * This is a wrapper for compatibility with the select vector used by + * cdevsw. It relies on a proper xxxdevtotty routine. + */ +int +ttpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + return ttypoll((*cdevsw[major(dev)]->d_devtotty)(dev), events, p); +} + +/* + * Must be called at spltty(). + */ +static int +ttnread(tp) + struct tty *tp; +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) { + nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } + return (nread); +} + +/* + * Wait for output to drain. + */ +int +ttywait(tp) + register struct tty *tp; +{ + int error, s; + + error = 0; + s = spltty(); + while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { + (*tp->t_oproc)(tp); + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else + break; + } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; + splx(s); + return (error); +} + +/* + * Flush if successfully wait. + */ +static int +ttywflush(tp) + struct tty *tp; +{ + int error; + + if ((error = ttywait(tp)) == 0) + ttyflush(tp, FREAD); + return (error); +} + +/* + * Flush tty read and/or write queues, notifying anyone waiting. 
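+ * `rw' is a mask of FREAD and/or FWRITE selecting which queues are
+ * discarded; ttyflush(tp, FREAD | FWRITE) empties both sides.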
+ */ +void +ttyflush(tp, rw) + register struct tty *tp; + int rw; +{ + register int s; + + s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, rw); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw); +#endif + if (rw & FREAD) { + FLUSHQ(&tp->t_canq); + FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); + tp->t_rocount = 0; + tp->t_rocol = 0; + CLR(tp->t_state, TS_LOCAL); + ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } + } + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + ttwwakeup(tp); + } + splx(s); +} + +/* + * Copy in the default termios characters. + */ +void +termioschars(t) + struct termios *t; +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. + */ +void +ttychars(tp) + struct tty *tp; +{ + + termioschars(&tp->t_termios); +} + +/* + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. + */ +void +ttyblock(tp) + struct tty *tp; +{ + + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(tp) + struct tty *tp; +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. + */ +void +ttrstrt(tp_arg) + void *tp_arg; +{ + struct tty *tp; + int s; + + KASSERT(tp_arg != NULL, ("ttrstrt")); + + tp = tp_arg; + s = spltty(); + + CLR(tp->t_state, TS_TIMEOUT); + ttstart(tp); + + splx(s); +} +#endif + +int +ttstart(tp) + struct tty *tp; +{ + + if (tp->t_oproc != NULL) /* XXX: Kludge for pty. 
*/ + (*tp->t_oproc)(tp); + return (0); +} + +/* + * "close" a line discipline + */ +int +ttylclose(tp, flag) + struct tty *tp; + int flag; +{ + + if (flag & FNONBLOCK || ttywflush(tp)) + ttyflush(tp, FREAD | FWRITE); + return (0); +} + +/* + * Handle modem control transition on a tty. + * Flag indicates new state of carrier. + * Returns 0 if the line should be turned off, otherwise 1. + */ +int +ttymodem(tp, flag) + register struct tty *tp; + int flag; +{ + + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { + /* + * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. + */ + if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); +#endif + } + } else if (flag == 0) { + /* + * Lost carrier. + */ + CLR(tp->t_state, TS_CARR_ON); + if (ISSET(tp->t_state, TS_ISOPEN) && + !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); + if (tp->t_session && tp->t_session->s_leader) + psignal(tp->t_session->s_leader, SIGHUP); + ttyflush(tp, FREAD | FWRITE); + return (0); + } + } else { + /* + * Carrier now on. + */ + SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +static void +ttypend(tp) + register struct tty *tp; +{ + struct clist tq; + register int c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ + tq = tp->t_rawq; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(tp, uio, flag) + register struct tty *tp; + struct uio *uio; + int flag; +{ + register struct clist *qp; + register int c; + register tcflag_t lflag; + register cc_t *cc = tp->t_cc; + register struct proc *p = curproc; + int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ + struct timeval stime; + +loop: + s = spltty(); + lflag = tp->t_lflag; + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) { + ttypend(tp); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + splx(s); + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) + return (error); + goto loop; + } + + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. 
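+ * In canonical mode ttyinput() has already moved each completed
+ * line from t_rawq to t_canq, so t_canq holds only whole lines.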
+ * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; + + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { + splx(s); + return (0); + } + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval timecopy; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. */ + splx(s); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } + } +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see tvtohz() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use tvtohz() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); + splx(s); + if (error == EWOULDBLOCK) + error = 0; + else if (error) + return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). + */ + slp = 0; + goto loop; + } +read: + splx(s); + /* + * Input present, check for input mapping and processing. + */ + first = 1; + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. 
+ */ +#if NSNP > 0 + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, ibuf, icc); +#endif + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { + pgsignal(tp->t_pgrp, SIGTSTP, 1); + if (first) { + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + /* XXX should ungetc(c, qp). */ + break; +#if NSNP > 0 + /* + * Only snoop directly on input in echo mode. Non-echoed + * input will be snooped later iff the application echoes it. + */ + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpinc((struct snoop *)tp->t_sc, (char)c); +#endif + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) + break; + first = 0; + } + +out: + /* + * Look to unblock input now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) + ttyunblock(tp); + splx(s); + + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. + */ +int +ttycheckoutq(tp, wait) + register struct tty *tp; + int wait; +{ + int hiwat, s, oldsig; + + hiwat = tp->t_ohiwat; + s = spltty(); + oldsig = wait ? curproc->p_siglist : 0; + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) + while (tp->t_outq.c_cc > hiwat) { + ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; + if (wait == 0 || curproc->p_siglist != oldsig) { + splx(s); + return (0); + } + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); + } + splx(s); + return (1); +} + +/* + * Process a write call on a tty device. + */ +int +ttwrite(tp, uio, flag) + register struct tty *tp; + register struct uio *uio; + int flag; +{ + register char *cp = NULL; + register int cc, ce; + register struct proc *p; + int i, hiwat, cnt, error, s; + char obuf[OBUFSIZ]; + + hiwat = tp->t_ohiwat; + cnt = uio->uio_resid; + error = 0; + cc = 0; +loop: + s = spltty(); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { + splx(s); + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; + } + splx(s); + /* + * Hang the process if it's in the background. 
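+ * (A background write draws SIGTTOU only if TOSTOP is set and the
+ * signal is neither ignored nor blocked and the writer is not in the
+ * middle of a vfork(); a write from an orphaned process group fails
+ * with EIO instead of stopping.)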
+ */ + p = curproc; + if (isbackground(p, tp) && + ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) { + error = EIO; + goto out; + } + pgsignal(p->p_pgrp, SIGTTOU, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) + goto out; + goto loop; + } + /* + * Process the user's data in at most OBUFSIZ chunks. Perform any + * output translation. Keep track of high water mark, sleep on + * overflow awaiting device aid in acquiring new space. + */ + while (uio->uio_resid > 0 || cc > 0) { + if (ISSET(tp->t_lflag, FLUSHO)) { + uio->uio_resid = 0; + return (0); + } + if (tp->t_outq.c_cc > hiwat) + goto ovhiwat; + /* + * Grab a hunk of data from the user, unless we have some + * leftover from last time. + */ + if (cc == 0) { + cc = imin(uio->uio_resid, OBUFSIZ); + cp = obuf; + error = uiomove(cp, cc, uio); + if (error) { + cc = 0; + break; + } +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, cp, cc); +#endif + } + /* + * If nothing fancy need be done, grab those characters we + * can handle without any of ttyoutput's processing and + * just transfer them to the output q. For those chars + * which require special processing (as indicated by the + * bits in char_type), call ttyoutput. After processing + * a hunk of data, look for FLUSHO so ^O's will take effect + * immediately. + */ + while (cc > 0) { + if (!ISSET(tp->t_oflag, OPOST)) + ce = cc; + else { + ce = cc - scanc((u_int)cc, (u_char *)cp, + char_type, CCLASSMASK); + /* + * If ce is zero, then we're processing + * a special character through ttyoutput. + */ + if (ce == 0) { + tp->t_rocount = 0; + if (ttyoutput(*cp, tp) >= 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; + goto loop; + } + cp++; + cc--; + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + goto ovhiwat; + continue; + } + } + /* + * A bunch of normal characters have been found. + * Transfer them en masse to the output queue and + * continue processing at the top of the loop. + * If there are any further characters in this + * <= OBUFSIZ chunk, the first should be a character + * requiring special handling by ttyoutput. + */ + tp->t_rocount = 0; + i = b_to_q(cp, ce, &tp->t_outq); + ce -= i; + tp->t_column += ce; + cp += ce, cc -= ce, tk_nout += ce; + tp->t_outcc += ce; + if (i > 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; + goto loop; + } + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + break; + } + ttstart(tp); + } +out: + /* + * If cc is nonzero, we leave the uio structure inconsistent, as the + * offset and iov pointers have moved forward, but it doesn't matter + * (the call will either return short or restart with a new uio). + */ + uio->uio_resid += cc; + return (error); + +ovhiwat: + ttstart(tp); + s = spltty(); + /* + * This can only occur if FLUSHO is set in t_lflag, + * or if ttstart/oproc is synchronous (or very fast). + */ + if (tp->t_outq.c_cc <= hiwat) { + splx(s); + goto loop; + } + if (flag & IO_NDELAY) { + splx(s); + uio->uio_resid += cc; + return (uio->uio_resid == cnt ? 
EWOULDBLOCK : 0); + } + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); + splx(s); + if (error == EWOULDBLOCK) + error = EIO; + if (error) + goto out; + goto loop; +} + +/* + * Rubout one character from the rawq of tp + * as cleanly as possible. + */ +static void +ttyrub(c, tp) + register int c; + register struct tty *tp; +{ + register char *cp; + register int savecol; + int tabc, s; + + if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) + return; + CLR(tp->t_lflag, FLUSHO); + if (ISSET(tp->t_lflag, ECHOE)) { + if (tp->t_rocount == 0) { + /* + * Screwed by ttwrite; retype + */ + ttyretype(tp); + return; + } + if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) + ttyrubo(tp, 2); + else { + CLR(c, ~TTY_CHARMASK); + switch (CCLASS(c)) { + case ORDINARY: + ttyrubo(tp, 1); + break; + case BACKSPACE: + case CONTROL: + case NEWLINE: + case RETURN: + case VTAB: + if (ISSET(tp->t_lflag, ECHOCTL)) + ttyrubo(tp, 2); + break; + case TAB: + if (tp->t_rocount < tp->t_rawq.c_cc) { + ttyretype(tp); + return; + } + s = spltty(); + savecol = tp->t_column; + SET(tp->t_state, TS_CNTTB); + SET(tp->t_lflag, FLUSHO); + tp->t_column = tp->t_rocol; + cp = tp->t_rawq.c_cf; + if (cp) + tabc = *cp; /* XXX FIX NEXTC */ + for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) + ttyecho(tabc, tp); + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_CNTTB); + splx(s); + + /* savecol will now be length of the tab. */ + savecol -= tp->t_column; + tp->t_column += savecol; + if (savecol > 8) + savecol = 8; /* overflow screw */ + while (--savecol >= 0) + (void)ttyoutput('\b', tp); + break; + default: /* XXX */ +#define PANICSTR "ttyrub: would panic c = %d, val = %d\n" + (void)printf(PANICSTR, c, CCLASS(c)); +#ifdef notdef + panic(PANICSTR, c, CCLASS(c)); +#endif + } + } + } else if (ISSET(tp->t_lflag, ECHOPRT)) { + if (!ISSET(tp->t_state, TS_ERASE)) { + SET(tp->t_state, TS_ERASE); + (void)ttyoutput('\\', tp); + } + ttyecho(c, tp); + } else + ttyecho(tp->t_cc[VERASE], tp); + --tp->t_rocount; +} + +/* + * Back over cnt characters, erasing them. + */ +static void +ttyrubo(tp, cnt) + register struct tty *tp; + int cnt; +{ + + while (cnt-- > 0) { + (void)ttyoutput('\b', tp); + (void)ttyoutput(' ', tp); + (void)ttyoutput('\b', tp); + } +} + +/* + * ttyretype -- + * Reprint the rawq line. Note, it is assumed that c_cc has already + * been checked. + */ +static void +ttyretype(tp) + register struct tty *tp; +{ + register char *cp; + int s, c; + + /* Echo the reprint character. */ + if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) + ttyecho(tp->t_cc[VREPRINT], tp); + + (void)ttyoutput('\n', tp); + + /* + * XXX + * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE + * BIT OF FIRST CHAR. + */ + s = spltty(); + for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) + ttyecho(c, tp); + for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) + ttyecho(c, tp); + CLR(tp->t_state, TS_ERASE); + splx(s); + + tp->t_rocount = tp->t_rawq.c_cc; + tp->t_rocol = 0; +} + +/* + * Echo a typed character to the terminal. 
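+ *
+ * With ECHOCTL set, control characters are echoed in caret notation:
+ * e.g. 0x03 (ETX) comes out as "^C" and 0x7f (DEL) as "^?"; tab and
+ * newline are passed through unchanged.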
+ */ +static void +ttyecho(c, tp) + register int c; + register struct tty *tp; +{ + + if (!ISSET(tp->t_state, TS_CNTTB)) + CLR(tp->t_lflag, FLUSHO); + if ((!ISSET(tp->t_lflag, ECHO) && + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || + ISSET(tp->t_lflag, EXTPROC)) + return; + if (ISSET(tp->t_lflag, ECHOCTL) && + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || + ISSET(c, TTY_CHARMASK) == 0177)) { + (void)ttyoutput('^', tp); + CLR(c, ~TTY_CHARMASK); + if (c == 0177) + c = '?'; + else + c += 'A' - 1; + } + (void)ttyoutput(c, tp); +} + +/* + * Wake up any readers on a tty. + */ +void +ttwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_rsel.si_pid != 0) + selwakeup(&tp->t_rsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(tp->t_sigio, SIGIO, (tp->t_session != NULL)); + wakeup(TSA_HUP_OR_INPUT(tp)); +} + +/* + * Wake up any writers on a tty. + */ +void +ttwwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_olowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_olowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } +} + +/* + * Look up a code for a specified speed in a conversion table; + * used by drivers to map software speed values to hardware parameters. + */ +int +ttspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + + for ( ; table->sp_speed != -1; table++) + if (table->sp_speed == speed) + return (table->sp_code); + return (-1); +} + +/* + * Set input and output watermarks and buffer sizes. For input, the + * high watermark is about one second's worth of input above empty, the + * low watermark is slightly below high water, and the buffer size is a + * driver-dependent amount above high water. For output, the watermarks + * are near the ends of the buffer, with about 1 second's worth of input + * between them. All this only applies to the standard line discipline. + */ +void +ttsetwater(tp) + struct tty *tp; +{ + register int cps, ttmaxhiwat, x; + + /* Input. */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + switch (tp->t_ispeedwat) { + case (speed_t)-1: + cps = tp->t_ispeed / 10; + break; + case 0: + /* + * This case is for old drivers that don't know about + * t_ispeedwat. Arrange for them to get the old buffer + * sizes and watermarks. + */ + cps = TTYHOG - 2 * 256; + tp->t_ififosize = 2 * 256; + break; + default: + cps = tp->t_ispeedwat / 10; + break; + } + tp->t_ihiwat = cps; + tp->t_ilowat = 7 * cps / 8; + x = cps + tp->t_ififosize; + clist_alloc_cblocks(&tp->t_rawq, x, x); + + /* Output. */ + switch (tp->t_ospeedwat) { + case (speed_t)-1: + cps = tp->t_ospeed / 10; + ttmaxhiwat = 2 * TTMAXHIWAT; + break; + case 0: + cps = tp->t_ospeed / 10; + ttmaxhiwat = TTMAXHIWAT; + break; + default: + cps = tp->t_ospeedwat / 10; + ttmaxhiwat = 8 * TTMAXHIWAT; + break; + } +#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) + tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); + x += cps; + x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ + tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ + x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ + x += OBUFSIZ + 100; + clist_alloc_cblocks(&tp->t_outq, x, x); +#undef CLAMP +} + +/* + * Report on state of foreground process group. 
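+ *
+ * The status line written to the tty (typically in response to the
+ * VSTATUS character, ^T by default) looks roughly like
+ *
+ *	load: 0.42  cmd: cc 1234 [running] 1.23u 0.45s 3% 212k
+ *
+ * i.e. the load average, then the "interesting" process chosen by
+ * proc_compare() with its state, user/system time, %cpu and resident
+ * set size.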
+ */ +void +ttyinfo(tp) + register struct tty *tp; +{ + register struct proc *p, *pick; + struct timeval utime, stime; + int tmp; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); + + if (tp->t_session == NULL) + ttyprintf(tp, "not a controlling terminal\n"); + else if (tp->t_pgrp == NULL) + ttyprintf(tp, "no foreground process group\n"); + else if ((p = tp->t_pgrp->pg_members.lh_first) == 0) + ttyprintf(tp, "empty foreground process group\n"); + else { + /* Pick interesting process. */ + for (pick = NULL; p != 0; p = p->p_pglist.le_next) + if (proc_compare(pick, p)) + pick = p; + + ttyprintf(tp, " cmd: %s %d [%s] ", pick->p_comm, pick->p_pid, + pick->p_stat == SRUN ? "running" : + pick->p_wmesg ? pick->p_wmesg : "iowait"); + + calcru(pick, &utime, &stime, NULL); + + /* Print user time. */ + ttyprintf(tp, "%ld.%02ldu ", + utime.tv_sec, utime.tv_usec / 10000); + + /* Print system time. */ + ttyprintf(tp, "%ld.%02lds ", + stime.tv_sec, stime.tv_usec / 10000); + +#define pgtok(a) (((a) * PAGE_SIZE) / 1024) + /* Print percentage cpu, resident set size. */ + tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "%d%% %ldk\n", + tmp / 100, + pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : +#ifdef pmap_resident_count + (long)pgtok(pmap_resident_count(&pick->p_vmspace->vm_pmap)) +#else + (long)pgtok(pick->p_vmspace->vm_rssize) +#endif + ); + } + tp->t_rocount = 0; /* so pending input will be retyped if BS */ +} + +/* + * Returns 1 if p2 is "better" than p1 + * + * The algorithm for picking the "interesting" process is thus: + * + * 1) Only foreground processes are eligible - implied. + * 2) Runnable processes are favored over anything else. The runner + * with the highest cpu utilization is picked (p_estcpu). Ties are + * broken by picking the highest pid. + * 3) The sleeper with the shortest sleep time is next. With ties, + * we pick out just "short-term" sleepers (P_SINTR == 0). + * 4) Further ties are broken by picking the highest pid. + */ +#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define TESTAB(a, b) ((a)<<1 | (b)) +#define ONLYA 2 +#define ONLYB 1 +#define BOTH 3 + +static int +proc_compare(p1, p2) + register struct proc *p1, *p2; +{ + + if (p1 == NULL) + return (1); + /* + * see if at least one of them is runnable + */ + switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + case ONLYA: + return (0); + case ONLYB: + return (1); + case BOTH: + /* + * tie - favor one with highest recent cpu utilization + */ + if (p2->p_estcpu > p1->p_estcpu) + return (1); + if (p1->p_estcpu > p2->p_estcpu) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * weed out zombies + */ + switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * pick the one with the smallest sleep time + */ + if (p2->p_slptime > p1->p_slptime) + return (0); + if (p1->p_slptime > p2->p_slptime) + return (1); + /* + * favor one sleeping in a non-interruptible sleep + */ + if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0) + return (1); + if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ +} + +/* + * Output char to tty; console putchar style. 
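+ *
+ * A '\n' is expanded to "\r\n" on the way out and output is kicked
+ * off immediately; the call fails with -1 if the line has lost
+ * carrier (TS_CONNECTED clear).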
+ */ +int +tputchar(c, tp) + int c; + struct tty *tp; +{ + register int s; + + s = spltty(); + if (!ISSET(tp->t_state, TS_CONNECTED)) { + splx(s); + return (-1); + } + if (c == '\n') + (void)ttyoutput('\r', tp); + (void)ttyoutput(c, tp); + ttstart(tp); + splx(s); + return (0); +} + +/* + * Sleep on chan, returning ERESTART if tty changed while we napped and + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If + * the tty is revoked, restarting a pending call will redo validation done + * at the start of the call. + */ +int +ttysleep(tp, chan, pri, wmesg, timo) + struct tty *tp; + void *chan; + int pri, timo; + char *wmesg; +{ + int error; + int gen; + + gen = tp->t_gen; + error = tsleep(chan, pri, wmesg, timo); + if (error) + return (error); + return (tp->t_gen == gen ? 0 : ERESTART); +} + +#ifdef notyet +/* + * XXX this is usable not useful or used. Most tty drivers have + * ifdefs for using ttymalloc() but assume a different interface. + */ +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc() +{ + struct tty *tp; + + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK); + bzero(tp, sizeof *tp); + return (tp); +} +#endif + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). + */ +void +ttyfree(tp) + struct tty *tp; +{ + free(tp, M_TTYS); +} +#endif /* 0 */ diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c new file mode 100644 index 0000000..fa2ae5c --- /dev/null +++ b/sys/kern/tty_compat.c @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $Id: tty_compat.c,v 1.27 1998/02/25 06:16:37 bde Exp $ + */ + +#include "opt_compat.h" + +/* + * mapping routines for old line discipline (yuck) + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl_compat.h> +#include <sys/tty.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int ttcompatgetflags __P((struct tty *tp)); +static void ttcompatsetflags __P((struct tty *tp, struct termios *t)); +static void ttcompatsetlflags __P((struct tty *tp, struct termios *t)); +static int ttcompatspeedtab __P((int speed, struct speedtab *table)); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); + +static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, + { 38400, 15 }, + { 19200, 14 }, + { 9600, 13 }, + { 4800, 12 }, + { 2400, 11 }, + { 1800, 10 }, + { 1200, 9 }, + { 600, 8 }, + { 300, 7 }, + { 200, 6 }, + { 150, 5 }, + { 134, 4 }, + { 110, 3 }, + { 75, 2 }, + { 50, 1 }, + { 0, 0 }, + { -1, -1 }, +}; +static int compatspcodes[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, +}; + +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) + register struct tty *tp; + u_long *com; + caddr_t data; + struct termios *term; +{ + switch (*com) { + case TIOCSETP: + case TIOCSETN: { + register struct sgttyb *sg = (struct sgttyb *)data; + int speed; + + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; + else + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; + else + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
TIOCSETAF : TIOCSETA; + break; + } + case TIOCSETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VINTR] = tc->t_intrc; + cc[VQUIT] = tc->t_quitc; + cc[VSTART] = tc->t_startc; + cc[VSTOP] = tc->t_stopc; + cc[VEOF] = tc->t_eofc; + cc[VEOL] = tc->t_brkc; + if (tc->t_brkc == -1) + cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; + break; + } + case TIOCSLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VSUSP] = ltc->t_suspc; + cc[VDSUSP] = ltc->t_dsuspc; + cc[VREPRINT] = ltc->t_rprntc; + cc[VDISCARD] = ltc->t_flushc; + cc[VWERASE] = ltc->t_werasc; + cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + u_long com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; + break; + } + case TIOCGLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc = tp->t_cc; + + ltc->t_suspc = cc[VSUSP]; + ltc->t_dsuspc = cc[VDSUSP]; + ltc->t_rprntc = cc[VREPRINT]; + ltc->t_flushc = cc[VDISCARD]; + ltc->t_werasc = cc[VWERASE]; + ltc->t_lnextc = cc[VLNEXT]; + break; + } + case TIOCLGET: + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; + if (ttydebug) + printf("CLGET: returning %x\n", *(int *)data); + break; + + case OTIOCGETD: + *(int *)data = tp->t_line ? tp->t_line : 2; + break; + + case OTIOCSETD: { + int ldisczero = 0; + + return (ttioctl(tp, TIOCSETD, + *(int *)data == 2 ? 
(caddr_t)&ldisczero : data, flag)); + } + + case OTIOCCONS: + *(int *)data = 1; + return (ttioctl(tp, TIOCCONS, data, flag)); + + default: + return (ENOIOCTL); + } + return (0); +} + +static int +ttcompatgetflags(tp) + register struct tty *tp; +{ + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; + register int flags = 0; + + if (iflag&IXOFF) + flags |= TANDEM; + if (iflag&ICRNL || oflag&ONLCR) + flags |= CRMOD; + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { + if (iflag&INPCK) { + if (cflag&PARODD) + flags |= ODDP; + else + flags |= EVENP; + } else + flags |= EVENP | ODDP; + } + + if ((lflag&ICANON) == 0) { + /* fudge */ + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || (cflag&(CSIZE|PARENB)) != CS8) + flags |= CBREAK; + else + flags |= RAW; + } + if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8) + flags |= LITOUT; + if (cflag&MDMBUF) + flags |= MDMBUF; + if ((cflag&HUPCL) == 0) + flags |= NOHANG; + if (oflag&OXTABS) + flags |= XTABS; + if (lflag&ECHOE) + flags |= CRTERA|CRTBS; + if (lflag&ECHOKE) + flags |= CRTKIL|CRTBS; + if (lflag&ECHOPRT) + flags |= PRTERA; + if (lflag&ECHOCTL) + flags |= CTLECH; + if ((iflag&IXANY) == 0) + flags |= DECCTQ; + flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); + if (ttydebug) + printf("getflags: %x\n", flags); + return (flags); +} + +static void +ttcompatsetflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + if (flags & RAW) { + iflag = IGNBRK; + lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); + } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + iflag |= BRKINT|IXON|IMAXBEL; + lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ + if (flags & XTABS) + oflag |= OXTABS; + else + oflag &= ~OXTABS; + if (flags & CBREAK) + lflag &= ~ICANON; + else + lflag |= ICANON; + if (flags&CRMOD) { + iflag |= ICRNL; + oflag |= ONLCR; + } else { + iflag &= ~ICRNL; + oflag &= ~ONLCR; + } + } + if (flags&ECHO) + lflag |= ECHO; + else + lflag &= ~ECHO; + + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + /* XXX don't set INPCK if RAW or PASS8? 
*/ + if ((flags&(EVENP|ODDP)) == EVENP) { + iflag |= INPCK; + cflag &= ~PARODD; + } else if ((flags&(EVENP|ODDP)) == ODDP) { + iflag |= INPCK; + cflag |= PARODD; + } else + iflag &= ~INPCK; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} + +static void +ttcompatsetlflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + if (flags&CRTERA) + lflag |= ECHOE; + else + lflag &= ~ECHOE; + if (flags&CRTKIL) + lflag |= ECHOKE; + else + lflag &= ~ECHOKE; + if (flags&PRTERA) + lflag |= ECHOPRT; + else + lflag &= ~ECHOPRT; + if (flags&CTLECH) + lflag |= ECHOCTL; + else + lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + if (flags & MDMBUF) + cflag |= MDMBUF; + else + cflag &= ~MDMBUF; + if (flags&NOHANG) + cflag &= ~HUPCL; + else + cflag |= HUPCL; + lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); + lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. + */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c new file mode 100644 index 0000000..12f26e0 --- /dev/null +++ b/sys/kern/tty_conf.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $Id: tty_conf.c,v 1.12 1997/12/16 17:40:27 eivind Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/conf.h> + +#ifndef MAXLDISC +#define MAXLDISC 8 +#endif + +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_ioctl_t l_nullioctl; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), +#endif + NODISC(3), /* TABLDISC */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* loadable */ + NODISC(7), /* loadable */ +}; + +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); + +#define LOADABLE_LDISC 6 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. + */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; +{ + int slot = -1; + + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } + + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; + + return slot; +} + +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. Can only deregister "loadable" ones now. + * + * discipline: Index for discipline to unload. 
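+ *
+ * A sketch of the intended usage from a loadable line discipline
+ * (the "foo_linesw" table and error handling are hypothetical):
+ *
+ *	int slot;
+ *
+ *	slot = ldisc_register(LDISC_LOAD, &foo_linesw);
+ *	if (slot < 0)
+ *		return (ENXIO);
+ *	...
+ *	ldisc_deregister(slot);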
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} + +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} + +/* + * Do nothing specific version of line + * discipline specific ioctl command. + */ +static int +l_nullioctl(tp, cmd, data, flags, p) + struct tty *tp; + u_long cmd; + char *data; + int flags; + struct proc *p; +{ + + return (ENOIOCTL); +} diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c index 6189d72..581ff3f 100644 --- a/sys/kern/tty_cons.c +++ b/sys/kern/tty_cons.c @@ -35,129 +35,323 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)cons.c 7.2 (Berkeley) 5/9/91 - * - * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE - * -------------------- ----- ---------------------- - * CURRENT PATCH LEVEL: 1 00083 - * -------------------- ----- ---------------------- - * - * 16 Aug 92 Pace Willisson /dev/console redirect (xterm -C, etc.) - * 14 Mar 93 Chris G. Demetriou Moved pg() here from isa/pccons.c + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $Id: cons.c,v 1.59 1998/08/23 08:26:40 bde Exp $ */ +#include "opt_devfs.h" -#include "sys/param.h" -#include "sys/proc.h" -#include "sys/user.h" -#include "sys/systm.h" -#include "sys/buf.h" -#include "sys/ioctl.h" -#include "sys/tty.h" -#include "sys/file.h" -#include "sys/conf.h" +#include <sys/param.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/uio.h> -#include "cons.h" +#include <machine/cpu.h> +#include <machine/cons.h> -/* XXX - all this could be autoconfig()ed */ -int pccnprobe(), pccninit(), pccngetc(), pccnputc(); -#include "com.h" -#if NCOM > 0 -int comcnprobe(), comcninit(), comcngetc(), comcnputc(); -#endif +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_poll_t cnpoll; -struct consdev constab[] = { - { pccnprobe, pccninit, pccngetc, pccnputc }, -#if NCOM > 0 - { comcnprobe, comcninit, comcngetc, comcnputc }, -#endif - { 0 }, +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = { + cnopen, cnclose, cnread, cnwrite, + cnioctl, nullstop, nullreset, nodevtotty, + cnpoll, nommap, NULL, "console", + NULL, -1, nodump, nopsize, + D_TTY, }; -/* end XXX */ -struct tty *constty = 0; /* virtual console output device */ -struct consdev *cn_tab; /* physical console device info */ -struct tty *cn_tty; /* XXX: console tty struct for tprintf */ +static dev_t cn_dev_t; /* seems to be never really used */ +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD, + &cn_dev_t, sizeof cn_dev_t, "T,dev_t", ""); + +static int cn_mute; +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ + +static u_char cn_is_open; /* nonzero if logical console is 
open */ +static int openmode, openflag; /* how /dev/console was openned */ +static u_char cn_phys_is_open; /* nonzero if physical device is open */ +static d_close_t *cn_phys_close; /* physical device close function */ +static d_open_t *cn_phys_open; /* physical device open function */ +static struct consdev *cn_tab; /* physical console device info */ +static struct tty *cn_tp; /* physical console tty struct */ +#ifdef DEVFS +static void *cn_devfs_token; /* represents the devfs entry */ +#endif /* DEVFS */ + +CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL); + +void cninit() { - register struct consdev *cp; + struct consdev *best_cp, *cp; + struct consdev **list; /* - * Collect information about all possible consoles - * and find the one with highest priority + * Find the first console with the highest priority. */ - for (cp = constab; cp->cn_probe; cp++) { + best_cp = NULL; + list = (struct consdev **)cons_set.ls_items; + while ((cp = *list++) != NULL) { + if (cp->cn_probe == NULL) + continue; (*cp->cn_probe)(cp); if (cp->cn_pri > CN_DEAD && - (cn_tab == NULL || cp->cn_pri > cn_tab->cn_pri)) - cn_tab = cp; + (best_cp == NULL || cp->cn_pri > best_cp->cn_pri)) + best_cp = cp; } + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * If no console, give up. + */ + if (best_cp == NULL) { + cn_tab = best_cp; + return; + } + + /* + * Initialize console, then attach to it. This ordering allows + * debugging using the previous console, if any. + * XXX if there was a previous console, then its driver should + * be informed when we forget about it. + */ + (*best_cp->cn_init)(best_cp); + cn_tab = best_cp; +} + +void +cninit_finish() +{ + struct cdevsw *cdp; + + if ((cn_tab == NULL) || cn_mute) + return; + /* - * No console, we can handle it + * Hook the open and close functions. */ - if ((cp = cn_tab) == NULL) + cdp = cdevsw[major(cn_tab->cn_dev)]; + cn_phys_close = cdp->d_close; + cdp->d_close = cnclose; + cn_phys_open = cdp->d_open; + cdp->d_open = cnopen; + cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev); + cn_dev_t = cn_tp->t_dev; +} + +static void +cnuninit(void) +{ + struct cdevsw *cdp; + + if (cn_tab == NULL) return; + /* - * Turn on console + * Unhook the open and close functions. */ - cn_tty = cp->cn_tp; - (*cp->cn_init)(cp); + cdp = cdevsw[major(cn_tab->cn_dev)]; + cdp->d_close = cn_phys_close; + cn_phys_close = NULL; + cdp->d_open = cn_phys_open; + cn_phys_open = NULL; + cn_tp = NULL; + cn_dev_t = 0; +} + +/* + * User has changed the state of the console muting. + * This may require us to open or close the device in question. + */ +static int +sysctl_kern_consmute SYSCTL_HANDLER_ARGS +{ + int error; + int ocn_mute; + + ocn_mute = cn_mute; + error = sysctl_handle_int(oidp, &cn_mute, 0, req); + if((error == 0) && (cn_tab != NULL) && (req->newptr != NULL)) { + if(ocn_mute && !cn_mute) { + /* + * going from muted to unmuted.. open the physical dev + * if the console has been openned + */ + cninit_finish(); + if(cn_is_open) + /* XXX curproc is not what we want really */ + error = cnopen(cn_dev_t, openflag, + openmode, curproc); + /* if it failed, back it out */ + if ( error != 0) cnuninit(); + } else if (!ocn_mute && cn_mute) { + /* + * going from unmuted to muted.. 
close the physical dev + * if it's only open via /dev/console + */ + if(cn_is_open) + error = cnclose(cn_dev_t, openflag, + openmode, curproc); + if ( error == 0) cnuninit(); + } + if (error != 0) { + /* + * back out the change if there was an error + */ + cn_mute = ocn_mute; + } + } + return (error); } +SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof cn_mute, sysctl_kern_consmute, "I", ""); + +static int cnopen(dev, flag, mode, p) dev_t dev; int flag, mode; struct proc *p; { + dev_t cndev, physdev; + int retval = 0; + if (cn_tab == NULL) return (0); - dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_open)(dev, flag, mode, p)); + cndev = cn_tab->cn_dev; + physdev = (major(dev) == major(cndev) ? dev : cndev); + /* + * If mute is active, then non console opens don't get here + * so we don't need to check for that. They + * bypass this and go straight to the device. + */ + if(!cn_mute) + retval = (*cn_phys_open)(physdev, flag, mode, p); + if (retval == 0) { + /* + * check if we openned it via /dev/console or + * via the physical entry (e.g. /dev/sio0). + */ + if (dev == cndev) + cn_phys_is_open = 1; + else if (physdev == cndev) { + openmode = mode; + openflag = flag; + cn_is_open = 1; + } + } + return (retval); } - + +static int cnclose(dev, flag, mode, p) dev_t dev; int flag, mode; struct proc *p; { + dev_t cndev; + if (cn_tab == NULL) return (0); - dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, p)); + cndev = cn_tab->cn_dev; + /* + * act appropriatly depending on whether it's /dev/console + * or the pysical device (e.g. /dev/sio) that's being closed. + * in either case, don't actually close the device unless + * both are closed. + */ + if (dev == cndev) { + /* the physical device is about to be closed */ + cn_phys_is_open = 0; + if (cn_is_open) { + if (cn_tp) { + /* perform a ttyhalfclose() */ + /* reset session and proc group */ + cn_tp->t_pgrp = NULL; + cn_tp->t_session = NULL; + } + return (0); + } + } else if (major(dev) != major(cndev)) { + /* the logical console is about to be closed */ + cn_is_open = 0; + if (cn_phys_is_open) + return (0); + dev = cndev; + } + if(cn_phys_close) + return ((*cn_phys_close)(dev, flag, mode, p)); + return (0); } - + +static int cnread(dev, uio, flag) dev_t dev; struct uio *uio; + int flag; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (0); dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_read)(dev, uio, flag)); + return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag)); } - + +static int cnwrite(dev, uio, flag) dev_t dev; struct uio *uio; + int flag; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) { + uio->uio_resid = 0; /* dump the data */ return (0); - if (constty) /* 16 Aug 92*/ + } + if (constty) dev = constty->t_dev; else dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_write)(dev, uio, flag)); + return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag)); } - + +static int cnioctl(dev, cmd, data, flag, p) dev_t dev; + u_long cmd; caddr_t data; + int flag; struct proc *p; { int error; - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (0); /* * Superuser can always use this to wrest control of console @@ -171,43 +365,74 @@ cnioctl(dev, cmd, data, flag, p) return (0); } dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, data, flag, p)); + return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p)); } -/*ARGSUSED*/ -cnselect(dev, rw, p) +static int +cnpoll(dev, events, p) dev_t dev; - int rw; + int events; struct 
proc *p; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (1); - return (ttselect(cn_tab->cn_dev, rw, p)); + + dev = cn_tab->cn_dev; + + return ((*cdevsw[major(dev)]->d_poll)(dev, events, p)); } +int cngetc() { - if (cn_tab == NULL) - return (0); - return ((*cn_tab->cn_getc)(cn_tab->cn_dev)); + int c; + if ((cn_tab == NULL) || cn_mute) + return (-1); + c = (*cn_tab->cn_getc)(cn_tab->cn_dev); + if (c == '\r') c = '\n'; /* console input is always ICRNL */ + return (c); } +int +cncheckc() +{ + if ((cn_tab == NULL) || cn_mute) + return (-1); + return ((*cn_tab->cn_checkc)(cn_tab->cn_dev)); +} + +void cnputc(c) register int c; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return; if (c) { - (*cn_tab->cn_putc)(cn_tab->cn_dev, c); if (c == '\n') (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); + (*cn_tab->cn_putc)(cn_tab->cn_dev, c); } } -pg(p,q,r,s,t,u,v,w,x,y,z) char *p; { - printf(p,q,r,s,t,u,v,w,x,y,z); - printf("\n>"); - return(cngetc()); +static cn_devsw_installed = 0; + +static void +cn_drvinit(void *unused) +{ + dev_t dev; + + if( ! cn_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&cn_cdevsw,NULL); + cn_devsw_installed = 1; +#ifdef DEVFS + cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "console"); +#endif + } } +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) + diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c new file mode 100644 index 0000000..214f103 --- /dev/null +++ b/sys/kern/tty_pty.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $Id: tty_pty.c,v 1.53 1998/07/15 12:18:30 bde Exp $ + */ + +/* + * Pseudo-teletype Driver + * (Actually two drivers, requiring two entries in 'cdevsw') + */ +#include "pty.h" /* XXX */ +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#ifdef notyet +static void ptyattach __P((int n)); +#endif +static void ptsstart __P((struct tty *tp)); +static void ptcwakeup __P((struct tty *tp, int flag)); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_stop_t ptsstop; +static d_devtotty_t ptydevtotty; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_poll_t ptcpoll; + +#define CDEV_MAJOR_S 5 +static struct cdevsw pts_cdevsw = { + ptsopen, ptsclose, ptsread, ptswrite, + ptyioctl, ptsstop, nullreset, ptydevtotty, + ttpoll, nommap, NULL, "pts", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#define CDEV_MAJOR_C 6 +static struct cdevsw ptc_cdevsw = { + ptcopen, ptcclose, ptcread, ptcwrite, + ptyioctl, nullstop, nullreset, ptydevtotty, + ptcpoll, nommap, NULL, "ptc", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#if NPTY == 1 +#undef NPTY +#define NPTY 32 /* crude XXX */ +#warning You have only one pty defined, redefining to 32. +#endif + +#ifdef DEVFS +#define MAXUNITS (8 * 32) +static void *devfs_token_pts[MAXUNITS]; +static void *devfs_token_ptc[MAXUNITS]; +static const char jnames[] = "pqrsPQRS"; +#if NPTY > MAXUNITS +#undef NPTY +#define NPTY MAXUNITS +#warning Can't have more than 256 pty's with DEVFS defined. +#endif +#endif + +#define BUFSIZ 100 /* Chunk size iomoved to/from user */ + +/* + * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + */ +static struct tty pt_tty[NPTY]; /* XXX */ +static struct pt_ioctl { + int pt_flags; + struct selinfo pt_selr, pt_selw; + u_char pt_send; + u_char pt_ucntl; +} pt_ioctl[NPTY]; /* XXX */ +static int npty = NPTY; /* for pstat -t */ + +#define PF_PKT 0x08 /* packet mode */ +#define PF_STOPPED 0x10 /* user told stopped */ +#define PF_REMOTE 0x20 /* remote and flow controlled input */ +#define PF_NOSTOP 0x40 +#define PF_UCNTL 0x80 /* user control mode */ + +#ifdef notyet +/* + * Establish n (or default if n is 1) ptys in the system. + * + * XXX cdevsw & pstat require the array `pty[]' to be an array + */ +static void +ptyattach(n) + int n; +{ + char *mem; + register u_long ntb; +#define DEFAULT_NPTY 32 + + /* maybe should allow 0 => none? 
*/ + if (n <= 1) + n = DEFAULT_NPTY; + ntb = n * sizeof(struct tty); + mem = malloc(ntb + ALIGNBYTES + n * sizeof(struct pt_ioctl), + M_DEVBUF, M_WAITOK); + pt_tty = (struct tty *)mem; + mem = (char *)ALIGN(mem + ntb); + pt_ioctl = (struct pt_ioctl *)mem; + npty = n; +} +#endif + +/*ARGSUSED*/ +static int +ptsopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + int error; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + if (tp->t_oproc) /* Ctrlr still around. */ + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + while ((tp->t_state & TS_CARR_ON) == 0) { + if (flag&FNONBLOCK) + break; + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +static int +ptsclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + register struct tty *tp; + int err; + + tp = &pt_tty[minor(dev)]; + err = (*linesw[tp->t_line].l_close)(tp, flag); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); + return (err); +} + +static int +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = curproc; + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_pgrp->pg_jobc == 0 || + p->p_flag & P_PPWAIT) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. + * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. + */ +static int +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. 
+ */ +static void +ptsstart(tp) + struct tty *tp; +{ + register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (tp->t_state & TS_TTSTOP) + return; + if (pti->pt_flags & PF_STOPPED) { + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send = TIOCPKT_START; + } + ptcwakeup(tp, FREAD); +} + +static void +ptcwakeup(tp, flag) + struct tty *tp; + int flag; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (flag & FREAD) { + selwakeup(&pti->pt_selr); + wakeup(TSA_PTC_READ(tp)); + } + if (flag & FWRITE) { + selwakeup(&pti->pt_selw); + wakeup(TSA_PTC_WRITE(tp)); + } +} + +static int +ptcopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + struct pt_ioctl *pti; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc) + return (EIO); + tp->t_oproc = ptsstart; +#ifdef sun4c + tp->t_stop = ptsstop; +#endif + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + tp->t_lflag &= ~EXTPROC; + pti = &pt_ioctl[minor(dev)]; + pti->pt_flags = 0; + pti->pt_send = 0; + pti->pt_ucntl = 0; + return (0); +} + +static int +ptcclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + (void)(*linesw[tp->t_line].l_modem)(tp, 0); + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. + */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + + tp->t_oproc = 0; /* mark closed */ + return (0); +} + +static int +ptcread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + char buf[BUFSIZ]; + int error = 0, cc; + + /* + * We want to block until the slave + * is open, and there's something to read; + * but if we lost the slave or we're NBIO, + * then return the appropriate error instead. 
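+ *
+ * In packet mode (PF_PKT) or user-control mode (PF_UCNTL) the data is
+ * preceded by a status byte: e.g. after ptsstart() restarts stopped
+ * output the next master read begins with TIOCPKT_START, and ordinary
+ * data reads are prefixed with a zero byte.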
+ */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove((caddr_t)&tp->t_termios, cc, + uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state & TS_CONNECTED) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + ttwwakeup(tp); + return (error); +} + +static void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +static int +ptcpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int revents = 0; + int s; + + if ((tp->t_state & TS_CONNECTED) == 0) + return (seltrue(dev, events, p) | POLLHUP); + + /* + * Need to block timeouts (ttrstart). + */ + s = spltty(); + + if (events & (POLLIN | POLLRDNORM)) + if ((tp->t_state & TS_ISOPEN) && + ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) || + ((pti->pt_flags & PF_PKT) && pti->pt_send) || + ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (tp->t_state & TS_ISOPEN && + ((pti->pt_flags & PF_REMOTE) ? 
+ (tp->t_canq.c_cc == 0) : + ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) || + (tp->t_canq.c_cc == 0 && (tp->t_iflag & ICANON))))) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & POLLHUP) + if ((tp->t_state & TS_CARR_ON) == 0) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) + selrecord(p, &pti->pt_selr); + + if (events & (POLLOUT | POLLWRNORM)) + selrecord(p, &pti->pt_selw); + } + splx(s); + + return (revents); +} + +static int +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register u_char *cp = 0; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; + return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. + */ + if (cc > 0) + break; + } + } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup(TSA_PTS_READ(tp)); + return (0); + } + while (uio->uio_resid > 0 || cc > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) { + wakeup(TSA_HUP_OR_INPUT(tp)); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq, or an empty canq. 
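The PF_REMOTE branch of ptcwrite() above implements TIOCREMOTE ("remote") mode: each controller write is queued on the slave's canonical queue as a single record, terminated internally with a NUL that ptsread() strips, and the usual input editing and echo are bypassed. A minimal sketch of how a program driving the pty might use it (mfd is assumed to be the controller descriptor):

#include <sys/ioctl.h>
#include <unistd.h>

/* Hand one already-edited input record to the slave via remote mode. */
static int
send_record(int mfd, const char *rec, size_t len)
{
	int on = 1;

	if (ioctl(mfd, TIOCREMOTE, &on) < 0)
		return (-1);
	/* One write() becomes one record on the slave's canonical queue. */
	return (write(mfd, rec, len) == (ssize_t)len ? 0 : -1);
}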
+ */ + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +static struct tty * +ptydevtotty(dev) + dev_t dev; +{ + if (minor(dev) >= npty) + return (NULL); + + return &pt_tty[minor(dev)]; +} + +/*ARGSUSED*/ +static int +ptyioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + register u_char *cc = tp->t_cc; + int stop, error; + + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in the outq. + */ + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_lflag & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } else + if (cdevsw[major(dev)]->d_open == ptcopen) + switch (cmd) { + + case TIOCGPGRP: + /* + * We avoid calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. + */ + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG || + *(unsigned int *)data == 0) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); + if (error == ENOIOCTL) + error = ttioctl(tp, cmd, data, flag); + if (error == ENOIOCTL) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
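Packet mode, enabled through the TIOCPKT case of ptyioctl() above, prefixes every controller read with a control byte: zero (TIOCPKT_DATA) when ordinary slave output follows, otherwise the accumulated TIOCPKT_* events from pt_send, such as the flow-control and ioctl notifications set elsewhere in this file. A hedged userland sketch of a reader in the style of rlogind (mfd again assumed to be the controller):

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

static void
packet_read(int mfd)
{
	char buf[512];
	ssize_t n;
	int on = 1;

	if (ioctl(mfd, TIOCPKT, &on) < 0)
		return;
	while ((n = read(mfd, buf, sizeof(buf))) > 0) {
		if (buf[0] == TIOCPKT_DATA) {		/* 0: payload follows */
			(void)fwrite(buf + 1, 1, (size_t)n - 1, stdout);
			continue;
		}
		if (buf[0] & (TIOCPKT_STOP | TIOCPKT_START))
			(void)printf("<flow control changed>\n");
		if (buf[0] & TIOCPKT_IOCTL)
			(void)printf("<termios changed; new state follows in this read>\n");
	}
}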
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} + +static int ptc_devsw_installed; + +static void ptc_drvinit __P((void *unused)); +static void +ptc_drvinit(unused) + void *unused; +{ +#ifdef DEVFS + int i,j,k; +#endif + dev_t dev; + + if( ! ptc_devsw_installed ) { + dev = makedev(CDEV_MAJOR_S, 0); + cdevsw_add(&dev, &pts_cdevsw, NULL); + dev = makedev(CDEV_MAJOR_C, 0); + cdevsw_add(&dev, &ptc_cdevsw, NULL); + ptc_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i<NPTY ; i++ ) { + j = i / 32; + k = i % 32; + devfs_token_pts[i] = + devfs_add_devswf(&pts_cdevsw,i, + DV_CHR,0,0,0666, + "tty%c%r",jnames[j],k); + devfs_token_ptc[i] = + devfs_add_devswf(&ptc_cdevsw,i, + DV_CHR,0,0,0666, + "pty%c%r",jnames[j],k); + } +#endif + } +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c new file mode 100644 index 0000000..ba71a94 --- /dev/null +++ b/sys/kern/tty_snoop.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 1995 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * Snoop stuff. 
+ */ + +#include "snp.h" + +#if NSNP > 0 + +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/snoop.h> +#include <sys/vnode.h> + +static d_open_t snpopen; +static d_close_t snpclose; +static d_read_t snpread; +static d_write_t snpwrite; +static d_ioctl_t snpioctl; +static d_poll_t snppoll; + +#define CDEV_MAJOR 53 +static struct cdevsw snp_cdevsw = + { snpopen, snpclose, snpread, snpwrite, /*53*/ + snpioctl, nostop, nullreset, nodevtotty,/* snoop */ + snppoll, nommap, NULL, "snp", NULL, -1 }; + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static struct snoop snoopsw[NSNP]; + +static struct tty *snpdevtotty __P((dev_t dev)); +static int snp_detach __P((struct snoop *snp)); + +static struct tty * +snpdevtotty (dev) + dev_t dev; +{ + struct cdevsw *cdp; + int maj; + + maj = major(dev); + if ((u_int)maj >= nchrdev) + return (NULL); + cdp = cdevsw[maj]; + if (cdp == NULL) + return (NULL); + return ((*cdp->d_devtotty)(dev)); +} + +#define SNP_INPUT_BUF 5 /* This is even too much,the maximal + * interactive mode write is 3 bytes + * length for function keys... + */ + +static int +snpwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), len, i, error; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp; + char c[SNP_INPUT_BUF]; + + if (snp->snp_tty == NULL) + return (EIO); + + tp = snp->snp_tty; + + if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) + goto tty_input; + + printf("Snoop: attempt to write to bad tty.\n"); + return (EIO); + +tty_input: + if (!(tp->t_state & TS_ISOPEN)) + return (EIO); + + while (uio->uio_resid > 0) { + len = MIN(uio->uio_resid,SNP_INPUT_BUF); + if ((error = uiomove(c, len, uio)) != 0) + return (error); + for (i=0;i<len;i++) { + if (ttyinput(c[i] , tp)) + return (EIO); + } + } + return 0; + +} + + +static int +snpread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), s; + struct snoop *snp = &snoopsw[unit]; + int len, n, nblen, error = 0; + caddr_t from; + char *nbuf; + + KASSERT(snp->snp_len + snp->snp_base <= snp->snp_blen, + ("snoop buffer error")); + + if (snp->snp_tty == NULL) + return (EIO); + + snp->snp_flags &= ~SNOOP_RWAIT; + + do { + if (snp->snp_len == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + snp->snp_flags |= SNOOP_RWAIT; + tsleep((caddr_t) snp, (PZERO + 1) | PCATCH, "snoopread", 0); + } + } while (snp->snp_len == 0); + + n = snp->snp_len; + + while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) { + len = MIN(uio->uio_resid, snp->snp_len); + from = (caddr_t) (snp->snp_buf + snp->snp_base); + if (len == 0) + break; + + error = uiomove(from, len, uio); + snp->snp_base += len; + snp->snp_len -= len; + } + if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) { + snp->snp_flags &= ~SNOOP_OFLOW; + } + s = spltty(); + nblen = snp->snp_blen; + if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) { + while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN)) + nblen = nblen / 2; + if (nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + 
free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } + } + splx(s); + + return error; +} + +int +snpinc(struct snoop *snp, char c) +{ + char buf[1]; + + buf[0]=c; + return (snpin(snp,buf,1)); +} + + +int +snpin(snp, buf, n) + struct snoop *snp; + char *buf; + int n; +{ + int s_free, s_tail; + int s, len, nblen; + caddr_t from, to; + char *nbuf; + + KASSERT(n >= 0, ("negative snoop char count")); + + if (n == 0) + return 0; + +#ifdef DIAGNOSTIC + if (!(snp->snp_flags & SNOOP_OPEN)) { + printf("Snoop: data coming to closed device.\n"); + return 0; + } +#endif + if (snp->snp_flags & SNOOP_DOWN) { + printf("Snoop: more data to down interface.\n"); + return 0; + } + + if (snp->snp_flags & SNOOP_OFLOW) { + printf("Snoop: buffer overflow.\n"); + /* + * On overflow we just repeat the standart close + * procedure...yes , this is waste of space but.. Then next + * read from device will fail if one would recall he is + * snooping and retry... + */ + + return (snpdown(snp)); + } + s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base); + s_free = snp->snp_blen - snp->snp_len; + + + if (n > s_free) { + s = spltty(); + nblen = snp->snp_blen; + while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) { + nblen = snp->snp_blen * 2; + s_free = nblen - (snp->snp_len + snp->snp_base); + } + if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } else { + snp->snp_flags |= SNOOP_OFLOW; + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + splx(s); + return 0; + } + splx(s); + } + if (n > s_tail) { + from = (caddr_t) (snp->snp_buf + snp->snp_base); + to = (caddr_t) (snp->snp_buf); + len = snp->snp_len; + bcopy(from, to, len); + snp->snp_base = 0; + } + to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len); + bcopy(buf, to, n); + snp->snp_len += n; + + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return n; +} + +static int +snpopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct snoop *snp; + register int unit, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if ((unit = minor(dev)) >= NSNP) + return (ENXIO); + + snp = &snoopsw[unit]; + + if (snp->snp_flags & SNOOP_OPEN) + return (ENXIO); + + /* + * We intentionally do not OR flags with SNOOP_OPEN,but set them so + * all previous settings (especially SNOOP_OFLOW) will be cleared. + */ + snp->snp_flags = SNOOP_OPEN; + + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_blen = SNOOP_MINLEN; + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * snp_tty == NULL is for inactive snoop devices. + */ + snp->snp_tty = NULL; + snp->snp_target = -1; + return (0); +} + + +static int +snp_detach(snp) + struct snoop *snp; +{ + struct tty *tp; + + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * If line disc. changed we do not touch this pointer,SLIP/PPP will + * change it anyway. 
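snpopen() above only prepares an unattached snoop instance with a SNOOP_MINLEN buffer; a target terminal is attached afterwards through the SNPSTTY ioctl handled further below, after which reads return whatever is written to that tty. A hedged userland sketch (the /dev/snp0 node name and the target path are assumptions for illustration):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/snoop.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical watcher: mirror output of another terminal to stdout. */
static int
watch_tty(const char *ttypath)
{
	struct stat st;
	dev_t tdev;
	char buf[1024];
	ssize_t n;
	int snpfd;

	if ((snpfd = open("/dev/snp0", O_RDONLY)) < 0)
		return (-1);
	if (stat(ttypath, &st) < 0)
		return (-1);
	tdev = st.st_rdev;
	if (ioctl(snpfd, SNPSTTY, &tdev) < 0)	/* attach to the target tty */
		return (-1);
	while ((n = read(snpfd, buf, sizeof(buf))) > 0)
		(void)fwrite(buf, 1, (size_t)n, stdout);
	return (0);
}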
+ */ + + if (snp->snp_tty == NULL) + goto detach_notty; + + tp = snp->snp_tty; + + if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) { + tp->t_sc = NULL; + tp->t_state &= ~TS_SNOOP; + } else + printf("Snoop: bad attached tty data.\n"); + + snp->snp_tty = NULL; + snp->snp_target = -1; + +detach_notty: + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return (0); +} + +static int +snpclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + snp->snp_blen = 0; + free(snp->snp_buf, M_TTYS); + snp->snp_flags &= ~SNOOP_OPEN; + + return (snp_detach(snp)); +} + +int +snpdown(snp) + struct snoop *snp; +{ + snp->snp_blen = SNOOP_MINLEN; + free(snp->snp_buf, M_TTYS); + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_flags |= SNOOP_DOWN; + + return (snp_detach(snp)); +} + + +static int +snpioctl(dev, cmd, data, flags, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct proc *p; +{ + int unit = minor(dev), s; + dev_t tdev; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp, *tpo; + + switch (cmd) { + case SNPSTTY: + tdev = *((dev_t *) data); + if (tdev == -1) + return (snpdown(snp)); + + tp = snpdevtotty(tdev); + if (!tp) + return (EINVAL); + + if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP)) + return (EBUSY); + + if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC)) + return (EBUSY); + + s = spltty(); + + if (snp->snp_target == -1) { + tpo = snp->snp_tty; + if (tpo) + tpo->t_state &= ~TS_SNOOP; + } + + tp->t_sc = (caddr_t) snp; + tp->t_state |= TS_SNOOP; + snp->snp_tty = tp; + snp->snp_target = tdev; + + /* + * Clean overflow and down flags - + * we'll have a chance to get them in the future :))) + */ + snp->snp_flags &= ~SNOOP_OFLOW; + snp->snp_flags &= ~SNOOP_DOWN; + splx(s); + break; + + case SNPGTTY: + /* + * We keep snp_target field specially to make + * SNPGTTY happy,else we can't know what is device + * major/minor for tty. + */ + *((dev_t *) data) = snp->snp_target; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *) data) + snp->snp_flags |= SNOOP_ASYNC; + else + snp->snp_flags &= ~SNOOP_ASYNC; + break; + + case FIONREAD: + s = spltty(); + if (snp->snp_tty != NULL) + *(int *) data = snp->snp_len; + else + if (snp->snp_flags & SNOOP_DOWN) { + if (snp->snp_flags & SNOOP_OFLOW) + *(int *) data = SNP_OFLOW; + else + *(int *) data = SNP_TTYCLOSE; + } else { + *(int *) data = SNP_DETACH; + } + splx(s); + break; + + default: + return (ENOTTY); + } + return (0); +} + + +static int +snppoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + int revents = 0; + + + /* + * If snoop is down,we don't want to poll() forever so we return 1. + * Caller should see if we down via FIONREAD ioctl().The last should + * return -1 to indicate down state. + */ + if (events & (POLLIN | POLLRDNORM)) + if (snp->snp_flags & SNOOP_DOWN || snp->snp_len > 0) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &snp->snp_sel); + + return (revents); +} + +#ifdef DEVFS +static void *snp_devfs_token[NSNP]; +#endif +static int snp_devsw_installed; + +static void snp_drvinit __P((void *unused)); +static void +snp_drvinit(unused) + void *unused; +{ + dev_t dev; +#ifdef DEVFS + int i; +#endif + + if( ! 
snp_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&snp_cdevsw, NULL); + snp_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i < NSNP ; i++) { + snp_devfs_token[i] = + devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0, + 0600, "snp%d", i); + } +#endif + } +} + +SYSINIT(snpdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,snp_drvinit,NULL) + + +#endif diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c new file mode 100644 index 0000000..593d00c --- /dev/null +++ b/sys/kern/tty_subr.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). 
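As the comment above explains, the slush pool only absorbs transient peaks; each tty reserves its own cblocks when it is opened and frees them on close, using the clist_alloc_cblocks() routine defined below. A rough kernel-context sketch of the kind of reservation a driver's open path makes (the queue sizes here are illustrative guesses, not the values any particular driver in this tree uses):

	/* At spltty(), in a hypothetical driver open routine: */
	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_outq, 1024, 1024);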
+ */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. 
+ */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
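getc() and q_to_b() above, together with putc() and b_to_q() below, make up the byte-at-a-time and block-move halves of the clist API; every caller is expected to be at spltty() and to have reserved cblocks for the clist beforehand. A rough kernel-context sketch of the usual round trip (illustrative only):

/*
 * Copy a linear buffer into a clist and drain it back out again.
 * Assumes clp already has reserved cblocks (see clist_alloc_cblocks()).
 */
static void
clist_roundtrip(struct clist *clp, char *src, int len)
{
	char sink[64];
	int s, notq, got;

	s = spltty();
	notq = b_to_q(src, len, clp);		/* returns bytes NOT queued */
	got = q_to_b(clp, sink, sizeof(sink));	/* returns bytes copied out */
	splx(s);
	(void)notq;
	(void)got;
}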
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. 
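As a concrete check of the mask arithmetic that follows: with startbit = 3 and numc = 10, endbit = 12, so startmask = 0xff >> (8 - 3) = 0x07 (keeping the three low bits of the first quote byte, which belong to characters already in the cblock) and endmask = 0xff << ((12 % 8) + 1) = 0xe0 (keeping the three high bits of the second byte, beyond endbit); the remaining bits in those two bytes are cleared because they are the quote bits of the characters just copied in, and in this example num_between is 0, so no whole bytes need bzero().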
+ */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. 
+ */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c new file mode 100644 index 0000000..8f4c84c --- /dev/null +++ b/sys/kern/tty_tb.c @@ -0,0 +1,367 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93 + * $Id$ + */ + +#include "tb.h" +#if NTB > 0 + +/* + * Line discipline for RS232 tablets; + * supplies binary coordinate data. + */ +#include <sys/param.h> +#include <sys/tablet.h> +#include <sys/tty.h> + +/* + * Tablet configuration table. 
+ */ +struct tbconf { + short tbc_recsize; /* input record size in bytes */ + short tbc_uiosize; /* size of data record returned user */ + int tbc_sync; /* mask for finding sync byte/bit */ + int (*tbc_decode)();/* decoding routine */ + char *tbc_run; /* enter run mode sequence */ + char *tbc_point; /* enter point mode sequence */ + char *tbc_stop; /* stop sequence */ + char *tbc_start; /* start/restart sequence */ + int tbc_flags; +#define TBF_POL 0x1 /* polhemus hack */ +#define TBF_INPROX 0x2 /* tablet has proximity info */ +}; + +static int tbdecode(), gtcodecode(), poldecode(); +static int tblresdecode(), tbhresdecode(); + +struct tbconf tbconf[TBTYPE] = { +{ 0 }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "6", "4" }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "\1CN", "\1RT", "\2", "\4" }, +{ 8, sizeof (struct gtcopos), 0200, gtcodecode }, +{17, sizeof (struct polpos), 0200, poldecode, 0, 0, "\21", "\5\22\2\23", + TBF_POL }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CL\33", "\1PT\33", 0, 0}, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CL\33", "\1PT\33", 0, 0}, +}; + +/* + * Tablet state + */ +struct tb { + int tbflags; /* mode & type bits */ +#define TBMAXREC 17 /* max input record size */ + char cbuf[TBMAXREC]; /* input buffer */ + union { + struct tbpos tbpos; + struct gtcopos gtcopos; + struct polpos polpos; + } rets; /* processed state */ +#define NTBS 16 +} tb[NTBS]; + +/* + * Open as tablet discipline; called on discipline change. + */ +/*ARGSUSED*/ +tbopen(dev, tp) + dev_t dev; + register struct tty *tp; +{ + register struct tb *tbp; + + if (tp->t_line == TABLDISC) + return (ENODEV); + ttywflush(tp); + for (tbp = tb; tbp < &tb[NTBS]; tbp++) + if (tbp->tbflags == 0) + break; + if (tbp >= &tb[NTBS]) + return (EBUSY); + tbp->tbflags = TBTIGER|TBPOINT; /* default */ + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + bzero((caddr_t)&tbp->rets, sizeof (tbp->rets)); + tp->T_LINEP = (caddr_t)tbp; + tp->t_flags |= LITOUT; + return (0); +} + +/* + * Line discipline change or last device close. + */ +tbclose(tp) + register struct tty *tp; +{ + register int s; + int modebits = TBPOINT|TBSTOP; + + tbioctl(tp, BIOSMODE, &modebits, 0); + s = spltty(); + ((struct tb *)tp->T_LINEP)->tbflags = 0; + tp->t_cp = 0; + tp->t_inbuf = 0; + tp->t_rawq.c_cc = 0; /* clear queues -- paranoid */ + tp->t_canq.c_cc = 0; + tp->t_line = 0; /* paranoid: avoid races */ + splx(s); +} + +/* + * Read from a tablet line. + * Characters have been buffered in a buffer and decoded. + */ +tbread(tp, uio) + register struct tty *tp; + struct uio *uio; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + int ret; + + if ((tp->t_state&TS_CARR_ON) == 0) + return (EIO); + ret = uiomove(&tbp->rets, tc->tbc_uiosize, uio); + if (tc->tbc_flags&TBF_POL) + tbp->rets.polpos.p_key = ' '; + return (ret); +} + +/* + * Low level character input routine. + * Stuff the character in the buffer, and decode + * if all the chars are there. + * + * This routine could be expanded in-line in the receiver + * interrupt routine to make it run as fast as possible. 
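As a concrete example of the framing these routines handle: tbinput() below synchronizes on the tbc_sync bit (0200 for the Hitachi formats) and strips the high bit (c & 0177) before buffering, so a 5-byte low-resolution record whose buffered bytes are 0x40 0x12 0x34 0x05 0x66 decodes in tbdecode() to proximity asserted (the 0100 bit of the first byte), xpos = (0x12 << 7) | 0x34 = 2356, and ypos = (0x05 << 7) | 0x66 = 742.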
+ */ +tbinput(c, tp) + register int c; + register struct tty *tp; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + + if (tc->tbc_recsize == 0 || tc->tbc_decode == 0) /* paranoid? */ + return; + /* + * Locate sync bit/byte or reset input buffer. + */ + if (c&tc->tbc_sync || tp->t_inbuf == tc->tbc_recsize) { + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + } + *tp->t_cp++ = c&0177; + /* + * Call decode routine only if a full record has been collected. + */ + if (++tp->t_inbuf == tc->tbc_recsize) + (*tc->tbc_decode)(tc, tbp->cbuf, &tbp->rets); +} + +/* + * Decode GTCO 8 byte format (high res, tilt, and pressure). + */ +static +gtcodecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct gtcopos *tbpos; +{ + + tbpos->pressure = *cp >> 2; + tbpos->status = (tbpos->pressure > 16) | TBINPROX; /* half way down */ + tbpos->xpos = (*cp++ & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = (*cp++ & 03) << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->xtilt = *cp++; + tbpos->ytilt = *cp++; + tbpos->scount++; +} + +/* + * Decode old Hitachi 5 byte format (low res). + */ +static +tbdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + register char byte; + + byte = *cp++; + tbpos->status = (byte&0100) ? TBINPROX : 0; + byte &= ~0100; + if (byte > 036) + tbpos->status |= 1 << ((byte-040)/2); + tbpos->xpos = *cp++ << 7; + tbpos->xpos |= *cp++; + if (tbpos->xpos < 256) /* tablet wraps around at 256 */ + tbpos->status &= ~TBINPROX; /* make it out of proximity */ + tbpos->ypos = *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->scount++; +} + +/* + * Decode new Hitach 5-byte format (low res). + */ +static +tblresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + + *cp &= ~0100; /* mask sync bit */ + tbpos->status = (*cp++ >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->xpos = *cp++; + tbpos->xpos |= *cp++ << 6; + tbpos->ypos = *cp++; + tbpos->ypos |= *cp++ << 6; + tbpos->scount++; +} + +/* + * Decode new Hitach 6-byte format (high res). + */ +static +tbhresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + char byte; + + byte = *cp++; + tbpos->xpos = (byte & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = *cp++ << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->status = (byte >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->scount++; +} + +/* + * Polhemus decode. 
+ */ +static +poldecode(tc, cp, polpos) + struct tbconf *tc; + register char *cp; + register struct polpos *polpos; +{ + + polpos->p_x = cp[4] | cp[3]<<7 | (cp[9] & 0x03) << 14; + polpos->p_y = cp[6] | cp[5]<<7 | (cp[9] & 0x0c) << 12; + polpos->p_z = cp[8] | cp[7]<<7 | (cp[9] & 0x30) << 10; + polpos->p_azi = cp[11] | cp[10]<<7 | (cp[16] & 0x03) << 14; + polpos->p_pit = cp[13] | cp[12]<<7 | (cp[16] & 0x0c) << 12; + polpos->p_rol = cp[15] | cp[14]<<7 | (cp[16] & 0x30) << 10; + polpos->p_stat = cp[1] | cp[0]<<7; + if (cp[2] != ' ') + polpos->p_key = cp[2]; +} + +/*ARGSUSED*/ +tbioctl(tp, cmd, data, flag) + struct tty *tp; + caddr_t data; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + + switch (cmd) { + + case BIOGMODE: + *(int *)data = tbp->tbflags & TBMODE; + break; + + case BIOSTYPE: + if (tbconf[*(int *)data & TBTYPE].tbc_recsize == 0 || + tbconf[*(int *)data & TBTYPE].tbc_decode == 0) + return (EINVAL); + tbp->tbflags &= ~TBTYPE; + tbp->tbflags |= *(int *)data & TBTYPE; + /* fall thru... to set mode bits */ + + case BIOSMODE: { + register struct tbconf *tc; + + tbp->tbflags &= ~TBMODE; + tbp->tbflags |= *(int *)data & TBMODE; + tc = &tbconf[tbp->tbflags & TBTYPE]; + if (tbp->tbflags&TBSTOP) { + if (tc->tbc_stop) + ttyout(tc->tbc_stop, tp); + } else if (tc->tbc_start) + ttyout(tc->tbc_start, tp); + if (tbp->tbflags&TBPOINT) { + if (tc->tbc_point) + ttyout(tc->tbc_point, tp); + } else if (tc->tbc_run) + ttyout(tc->tbc_run, tp); + ttstart(tp); + break; + } + + case BIOGTYPE: + *(int *)data = tbp->tbflags & TBTYPE; + break; + + case TIOCSETD: + case TIOCGETD: + case TIOCGETP: + case TIOCGETC: + return (-1); /* pass thru... */ + + default: + return (ENOTTY); + } + return (0); +} +#endif diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 0000000..889c935 --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,206 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $Id: tty_tty.c,v 1.24 1998/06/07 17:11:44 dfr Exp $ + */ + +/* + * Indirect driver for controlling tty. + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/ttycom.h> +#include <sys/vnode.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_poll_t cttypoll; + +#define CDEV_MAJOR 1 +/* Don't make this static, since fdesc_vnops uses it. */ +struct cdevsw ctty_cdevsw = { + cttyopen, nullclose, cttyread, cttywrite, + cttyioctl, nullstop, nullreset, nodevtotty, + cttypoll, nommap, NULL, "ctty", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL) + +/*ARGSUSED*/ +static int +cttyopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + int error; + + if (ttyvp == NULL) + return (ENXIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); +#ifdef PARANOID + /* + * Since group is tty and mode is 620 on most terminal lines + * and since sessions protect terminals from processes outside + * your session, this check is probably no longer necessary. + * Since it inhibits setuid root programs that later switch + * to another user from accessing /dev/tty, we have decided + * to delete this test. (mckusick 5/93) + */ + error = VOP_ACCESS(ttyvp, + (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? 
VWRITE : 0), p->p_ucred, p); + if (!error) +#endif /* PARANOID */ + error = VOP_OPEN(ttyvp, flag, NOCRED, p); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttyread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = uio->uio_procp; + register struct vnode *ttyvp = cttyvp(p); + int error; + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_READ(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttywrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = uio->uio_procp; + struct vnode *ttyvp = cttyvp(uio->uio_procp); + int error; + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_WRITE(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttyioctl(dev, cmd, addr, flag, p) + dev_t dev; + u_long cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + + if (ttyvp == NULL) + return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ + if (cmd == TIOCNOTTY) { + if (!SESS_LEADER(p)) { + p->p_flag &= ~P_CONTROLT; + return (0); + } else + return (EINVAL); + } + return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p)); +} + +/*ARGSUSED*/ +static int +cttypoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + + if (ttyvp == NULL) + /* try operation to get EOF/failure */ + return (seltrue(dev, events, p)); + return (VOP_POLL(ttyvp, events, p->p_ucred, p)); +} + +static int ctty_devsw_installed; +#ifdef DEVFS +static void *ctty_devfs_token; +#endif + +static void ctty_drvinit __P((void *unused)); +static void +ctty_drvinit(unused) + void *unused; +{ + dev_t dev; + + if( ! ctty_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&ctty_cdevsw,NULL); + ctty_devsw_installed = 1; +#ifdef DEVFS + ctty_devfs_token = + devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0, + 0666, "tty"); +#endif + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c new file mode 100644 index 0000000..929da87 --- /dev/null +++ b/sys/kern/uipc_domain.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
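The ctty indirect driver above does nothing more than forward each operation to the vnode of the caller's controlling terminal (the cttyvp() macro), so /dev/tty reaches the terminal even when the standard descriptors have been redirected, and open() fails with ENXIO for a process that has no controlling tty. A small userland illustration:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* Works even if stdin/stdout point at a file or a pipe. */
	fd = open("/dev/tty", O_RDWR);
	if (fd < 0) {
		perror("/dev/tty");	/* ENXIO: no controlling terminal */
		return (1);
	}
	(void)write(fd, "prompt: ", 8);
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		(void)write(fd, buf, (size_t)n);
	(void)close(fd);
	return (0);
}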
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $Id: uipc_domain.c,v 1.19 1998/05/15 20:11:29 wollman Exp $ + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socketvar.h> +#include <sys/systm.h> +#include <vm/vm_zone.h> + +/* + * System initialization + * + * Note: domain initialization wants to take place on a per domain basis + * as a result of traversing a linker set. Most likely, each domain + * want to call a registration function rather than being handled here + * in domaininit(). Probably this will look like: + * + * SYSINIT(unique, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, domain_add, xxx) + * + * Where 'xxx' is replaced by the address of a parameter struct to be + * passed to the doamin_add() function. + */ + +static int x_save_spl; /* used by kludge*/ +static void kludge_splimp __P((void *)); +static void kludge_splx __P((void *)); +static void domaininit __P((void *)); +SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl) +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) +SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl) + +static void pffasttimo __P((void *)); +static void pfslowtimo __P((void *)); + +struct domain *domains; + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +static int +net_init_domain(struct domain *dp) +{ + register struct protosw *pr; + int s; + + s = splnet(); + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ + if (pr->pr_usrreqs == 0) + panic("domaininit: %ssw[%d] has no usrreqs!", + dp->dom_name, + (int)(pr - dp->dom_protosw)); + if (pr->pr_init) + (*pr->pr_init)(); + } + /* + * update global informatio about maximums + */ + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + splx(s); + return (0); +} + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +int +net_add_domain(struct domain *dp) +{ + int s, error; + + s = splnet(); + dp->dom_next = domains; + domains = dp; + splx(s); + error = net_init_domain(dp); + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + return (error); +} + +extern struct linker_set domain_set; + +/* ARGSUSED*/ +static void +domaininit(void *dummy) +{ + register struct domain *dp, **dpp; + /* + * Before we do any setup, make sure to initialize the + * zone allocator we get struct sockets from. 
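net_add_domain() above is the registration hook the comment at the top of this file anticipates: it links the new domain onto the global list, runs its dom_init and every protosw pr_init, and refreshes max_hdr and max_datalen. A hedged sketch of how a protocol family outside the linker set might register itself (foodomain and the SYSINIT name are hypothetical placeholders, not part of this commit):

/* Hypothetical protocol family registering at protocol-domain time. */
extern struct domain foodomain;		/* assumed to be defined elsewhere */

static void
foo_register(void *dummy)
{
	net_add_domain(&foodomain);	/* cannot be unloaded; see comment above */
}
SYSINIT(foodomain_reg, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, foo_register, NULL)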
The obvious + * maximum number of sockets is `maxfiles', but it is possible + * to have a socket without an open file (e.g., a connection waiting + * to be accept(2)ed). Rather than think up and define a + * better value, we just use nmbclusters, since that's what people + * are told to increase first when the network runs out of memory. + * Perhaps we should have two pools, one of unlimited size + * for use during socreate(), and one ZONE_INTERRUPT pool for + * use in sonewconn(). + */ + socket_zone = zinit("socket", sizeof(struct socket), maxsockets, + ZONE_INTERRUPT, 0); + + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; + + /* + * NB - local domain is always present. + */ + net_add_domain(&localdomain); + + /* + * gather up as many protocols as we have statically linked. + * XXX we need to do this because when we ask the routing + * protocol to initialise it will want to examine all + * installed protocols. This needs fixing before protocols + * that use the standard routing can become modules. + */ + for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) { + (**dpp).dom_next = domains; + domains = *dpp; + } + + /* + * Now ask them all to init (XXX including the routing domain, + * see above) + */ + for (dp = domains; dp; dp = dp->dom_next) + net_init_domain(dp); + + timeout(pffasttimo, (void *)0, 1); + timeout(pfslowtimo, (void *)0, 1); +} + + +/* + * The following two operations are kludge code. Most likely, they should + * be done as a "domainpreinit()" for the first function and then rolled + * in as the last act of "domaininit()" for the second. + * + * In point of fact, it is questionable why other initialization prior + * to this does not also take place at splimp by default. + */ +static void +kludge_splimp(udata) + void *udata; +{ + int *savesplp = udata; + + *savesplp = splimp(); +} + +static void +kludge_splx(udata) + void *udata; +{ + int *savesplp = udata; + + splx(*savesplp); +} + + + +struct protosw * +pffindtype(family, type) + int family; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} + +struct protosw * +pffindproto(family, protocol, type) + int family; + int protocol; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { + if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) + return (pr); + + if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && + pr->pr_protocol == 0 && maybe == (struct protosw *)0) + maybe = pr; + } + return (maybe); +} + +void +pfctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, (void *)0); +} + +static void +pfslowtimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_slowtimo) + (*pr->pr_slowtimo)(); + 
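+ /*
+ * Re-arm below so the pr_slowtimo handlers keep running roughly
+ * twice a second; pffasttimo re-arms itself the same way at hz/5.
+ */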
timeout(pfslowtimo, (void *)0, hz/2); +} + +static void +pffasttimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + timeout(pffasttimo, (void *)0, hz/5); +} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 0000000..09ddd23 --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,945 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: uipc_mbuf.c,v 1.36 1998/07/03 08:36:48 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +static void mbinit __P((void *)); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL) + +struct mbuf *mbutl; +char *mclrefcnt; +struct mbstat mbstat; +struct mbuf *mmbfree; +union mcluster *mclfree; +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); +SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); + +static void m_reclaim __P((void)); + +/* "number of clusters of pages" */ +#define NCL_INIT 1 + +#define NMB_INIT 16 + +/* ARGSUSED*/ +static void +mbinit(dummy) + void *dummy; +{ + int s; + + mmbfree = NULL; mclfree = NULL; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + + s = splimp(); + if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0) + goto bad; +#if MCLBYTES <= PAGE_SIZE + if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) + goto bad; +#else + /* It's OK to call contigmalloc in this context. */ + if (m_clalloc(16, M_WAIT) == 0) + goto bad; +#endif + splx(s); + return; +bad: + panic("mbinit"); +} + +/* + * Allocate at least nmb mbufs and place on mbuf free list. + * Must be called at splimp. + */ +/* ARGSUSED */ +int +m_mballoc(nmb, how) + register int nmb; + int how; +{ + register caddr_t p; + register int i; + int nbytes; + + /* Once we run out of map space, it will be impossible to get + * any more (nothing is ever freed back to the map) (XXX which + * is dumb). (however you are not dead as m_reclaim might + * still be able to free a substantial amount of space). + */ + if (mb_map_full) + return (0); + + nbytes = round_page(nmb * MSIZE); + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT); + if (p == 0 && how == M_WAIT) { + mbstat.m_wait++; + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK); + } + + /* + * Either the map is now full, or `how' is M_NOWAIT and there + * are no pages left. + */ + if (p == NULL) + return (0); + + nmb = nbytes / MSIZE; + for (i = 0; i < nmb; i++) { + ((struct mbuf *)p)->m_next = mmbfree; + mmbfree = (struct mbuf *)p; + p += MSIZE; + } + mbstat.m_mbufs += nmb; + return (1); +} + +#if MCLBYTES > PAGE_SIZE +static int i_want_my_mcl; + +static void +kproc_mclalloc(void) +{ + int status; + + while (1) { + tsleep(&i_want_my_mcl, PVM, "mclalloc", 0); + + for (; i_want_my_mcl; i_want_my_mcl--) { + if (m_clalloc(1, M_WAIT) == 0) + printf("m_clalloc failed even in process context!\n"); + } + } +} + +static struct proc *mclallocproc; +static struct kproc_desc mclalloc_kp = { + "mclalloc", + kproc_mclalloc, + &mclallocproc +}; +SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, + &mclalloc_kp); +#endif + +/* + * Allocate some number of mbuf clusters + * and place on cluster free list. + * Must be called at splimp. 
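+ *
+ * When MCLBYTES is larger than PAGE_SIZE the clusters are obtained
+ * with contigmalloc1(); a caller that cannot sleep instead bumps
+ * i_want_my_mcl, wakes the mclalloc kernel thread above and takes a
+ * failure return for now. Otherwise the pages come straight out of
+ * mb_map via kmem_malloc().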
+ */ +/* ARGSUSED */ +int +m_clalloc(ncl, how) + register int ncl; + int how; +{ + register caddr_t p; + register int i; + int npg; + + /* + * Once we run out of map space, it will be impossible + * to get any more (nothing is ever freed back to the + * map). + */ + if (mb_map_full) { + mbstat.m_drops++; + return (0); + } + +#if MCLBYTES > PAGE_SIZE + if (how != M_WAIT) { + i_want_my_mcl += ncl; + wakeup(&i_want_my_mcl); + mbstat.m_wait++; + p = 0; + } else { + p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul, + ~0ul, PAGE_SIZE, 0, mb_map); + } +#else + npg = ncl; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), + how != M_WAIT ? M_NOWAIT : M_WAITOK); + ncl = ncl * PAGE_SIZE / MCLBYTES; +#endif + /* + * Either the map is now full, or `how' is M_NOWAIT and there + * are no pages left. + */ + if (p == NULL) { + mbstat.m_drops++; + return (0); + } + + for (i = 0; i < ncl; i++) { + ((union mcluster *)p)->mcl_next = mclfree; + mclfree = (union mcluster *)p; + p += MCLBYTES; + mbstat.m_clfree++; + } + mbstat.m_clusters += ncl; + return (1); +} + +/* + * When MGET failes, ask protocols to free space when short of memory, + * then re-attempt to allocate an mbuf. + */ +struct mbuf * +m_retry(i, t) + int i, t; +{ + register struct mbuf *m; + + /* + * Must only do the reclaim if not in an interrupt context. + */ + if (i == M_WAIT) + m_reclaim(); +#define m_retry(i, t) (struct mbuf *)0 + MGET(m, i, t); +#undef m_retry + if (m != NULL) { + mbstat.m_wait++; + } else { + if (i == M_DONTWAIT) + mbstat.m_drops++; + else + panic("Out of mbuf clusters"); + } + return (m); +} + +/* + * As above; retry an MGETHDR. + */ +struct mbuf * +m_retryhdr(i, t) + int i, t; +{ + register struct mbuf *m; + + /* + * Must only do the reclaim if not in an interrupt context. + */ + if (i == M_WAIT) + m_reclaim(); +#define m_retryhdr(i, t) (struct mbuf *)0 + MGETHDR(m, i, t); +#undef m_retryhdr + if (m != NULL) { + mbstat.m_wait++; + } else { + if (i == M_DONTWAIT) + mbstat.m_drops++; + else + panic("Out of mbuf clusters"); + } + return (m); +} + +static void +m_reclaim() +{ + register struct domain *dp; + register struct protosw *pr; + int s = splimp(); + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain) + (*pr->pr_drain)(); + splx(s); + mbstat.m_drain++; +} + +/* + * Space allocation routines. + * These are also available as macros + * for critical paths. + */ +struct mbuf * +m_get(how, type) + int how, type; +{ + register struct mbuf *m; + + MGET(m, how, type); + return (m); +} + +struct mbuf * +m_gethdr(how, type) + int how, type; +{ + register struct mbuf *m; + + MGETHDR(m, how, type); + return (m); +} + +struct mbuf * +m_getclr(how, type) + int how, type; +{ + register struct mbuf *m; + + MGET(m, how, type); + if (m == 0) + return (0); + bzero(mtod(m, caddr_t), MLEN); + return (m); +} + +struct mbuf * +m_free(m) + struct mbuf *m; +{ + register struct mbuf *n; + + MFREE(m, n); + return (n); +} + +void +m_freem(m) + register struct mbuf *m; +{ + register struct mbuf *n; + + if (m == NULL) + return; + do { + MFREE(m, n); + m = n; + } while (m); +} + +/* + * Mbuffer utility routines. + */ + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
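+ *
+ * On allocation failure the original chain is freed and a null pointer
+ * returned, so the caller must not touch the old chain again; if the
+ * old first mbuf carried a packet header it is moved to the new front
+ * mbuf.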
+ */ +struct mbuf * +m_prepend(m, len, how) + register struct mbuf *m; + int len, how; +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == (struct mbuf *)NULL) { + m_freem(m); + return ((struct mbuf *)NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. + */ +#define MCFail (mbstat.m_mcfail) + +struct mbuf * +m_copym(m, off0, len, wait) + register struct mbuf *m; + int off0, wait; + register int len; +{ + register struct mbuf *n, **np; + register int off = off0; + struct mbuf *top; + int copyhdr = 0; + + if (off < 0 || len < 0) + panic("m_copym"); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + if (m == 0) + panic("m_copym"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = ⊤ + top = 0; + while (len > 0) { + if (m == 0) { + if (len != M_COPYALL) + panic("m_copym"); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == 0) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == 0) + MCFail++; + return (top); +nospace: + m_freem(top); + MCFail++; + return (0); +} + +/* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + */ +struct mbuf * +m_copypacket(m, how) + struct mbuf *m; + int how; +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (!n) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (!o) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + MCFail++; + return 0; +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. 
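+ *
+ * The destination buffer must have room for "len" bytes; the routine
+ * panics if the chain is shorter than off + len. A typical use is to
+ * pull a header into local storage, e.g. (names and sizes illustrative
+ * only):
+ *
+ *	char hdr[HDR_LEN];
+ *
+ *	m_copydata(m, hdroff, sizeof(hdr), hdr);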
+ */ +void +m_copydata(m, off, len, cp) + register struct mbuf *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +void +m_cat(m, n) + register struct mbuf *m, *n; +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} + +void +m_adj(mp, req_len) + struct mbuf *mp; + int req_len; +{ + register int len = req_len; + register struct mbuf *m; + register int count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m->m_next) + (m = m->m_next) ->m_len = 0; + } +} + +/* + * Rearange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + */ +#define MPFail (mbstat.m_mpfail) + +struct mbuf * +m_pullup(n, len) + register struct mbuf *n; + int len; +{ + register struct mbuf *m; + register int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. 
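+ *
+ * The new mbuf is grabbed with M_DONTWAIT, so m_pullup() can fail even
+ * when memory would become available by sleeping; on any failure the
+ * chain is freed and a null pointer returned, hence the usual caller
+ * idiom (header size illustrative):
+ *
+ *	if ((m = m_pullup(m, sizeof(struct ip))) == 0)
+ *		return;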
+ */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == 0) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MPFail++; + return (0); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. + */ +struct mbuf * +m_split(m0, len0, wait) + register struct mbuf *m0; + int len0, wait; +{ + register struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == 0) + return (0); + remain = m->m_len - len; + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == 0) + return (0); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == 0) { + (void) m_free(n); + return (0); + } else + return (n); + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { + n = m->m_next; + m->m_next = 0; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == 0) + return (0); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = 0; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. + */ +struct mbuf * +m_devget(buf, totlen, off0, ifp, copy) + char *buf; + int totlen, off0; + struct ifnet *ifp; + void (*copy) __P((char *from, caddr_t to, u_int len)); +{ + register struct mbuf *m; + struct mbuf *top = 0, **mp = ⊤ + register int off = off0, len; + register char *cp; + char *epkt; + + cp = buf; + epkt = cp + totlen; + if (off) { + cp += off + 2 * sizeof(u_short); + totlen -= 2 * sizeof(u_short); + } + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == 0) + return (0); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + m->m_len = MHLEN; + + while (totlen > 0) { + if (top) { + MGET(m, M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(top); + return (0); + } + m->m_len = MLEN; + } + len = min(totlen, epkt - cp); + if (len >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + m->m_len = len = min(len, MCLBYTES); + else + len = m->m_len; + } else { + /* + * Place initial small packet/header at end of mbuf. 
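+ *
+ * Leaving max_linkhdr bytes of space in front of the data lets a
+ * link-level header be prepended later without allocating another
+ * mbuf.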
+ */ + if (len < m->m_len) { + if (top == 0 && len + max_linkhdr <= m->m_len) + m->m_data += max_linkhdr; + m->m_len = len; + } else + len = m->m_len; + } + if (copy) + copy(cp, mtod(m, caddr_t), (unsigned)len); + else + bcopy(cp, mtod(m, caddr_t), (unsigned)len); + cp += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + if (cp == epkt) + cp = buf; + } + return (top); +} + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. + */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 0000000..094d1bf --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_proto.c,v 1.16 1998/06/21 14:53:18 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/un.h> +#include <sys/unpcb.h> + +#include <net/raw_cb.h> + +/* + * Definitions of protocols supported in the LOCAL domain. + */ + +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ 0, 0, 0, 0, + 0, 0, raw_ctlinput, 0, + 0, + raw_init, 0, 0, 0, + &raw_usrreqs +} +}; + +struct domain localdomain = + { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; + +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..e718c62 --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. + */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. 
In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + static int rnd; + static struct timeval old_runtime; + static unsigned int cur_cnt, old_cnt; + struct timeval tv; + + getmicrouptime(&tv); + if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) { + old_runtime = tv; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_uid = head->so_uid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). 
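+ *
+ * Callers normally go through the sblock() macro and only end up here
+ * when the buffer is already locked by another process.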
+ */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. 
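+ *
+ * The character count is capped at sb_max scaled by
+ * MCLBYTES / (MSIZE + MCLBYTES), i.e. what sb_max bytes can hold once
+ * per-mbuf overhead is charged, and sb_mbmax is set to
+ * min(cc * sb_efficiency, sb_max).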
+ */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! */ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. 
+ */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. 
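+ *
+ * Zero-length mbufs are freed, and data from small mbufs is copied
+ * into spare room at the tail of the preceding mbuf when the types
+ * match, so a run of small appends does not burn a whole mbuf each;
+ * any M_EOR mark is carried forward to whatever mbuf ends up last.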
+ */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt && sb->sb_cc) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. 
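+ *
+ * Protocols point the pr_usrreqs entries they do not implement at
+ * these stubs rather than leaving null pointers for the socket layer
+ * to call through.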
+ */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. + */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. 
*/ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, ""); + diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 0000000..1efa8c5 --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1216 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $Id: uipc_socket.c,v 1.50 1999/01/20 17:31:54 fenner Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <vm/vm_zone.h> + +#include <machine/limits.h> + +struct vm_zone *socket_zone; +so_gen_t so_gencnt; /* generation count for sockets */ + +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); +MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, + 0, ""); + +/* + * Socket operation routines. 
+ * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. + */ + +/* + * Get a socket structure from our zone, and initialize it. + * We don't implement `waitok' yet (see comments in uipc_domain.c). + * Note that it would probably be better to allocate socket + * and PCB at the same time, but I'm not convinced that all + * the protocols can be easily modified to do this. + */ +struct socket * +soalloc(waitok) + int waitok; +{ + struct socket *so; + + so = zalloci(socket_zone); + if (so) { + /* XXX race condition for reentrant kernel */ + bzero(so, sizeof *so); + so->so_gencnt = ++so_gencnt; + so->so_zone = socket_zone; + } + return so; +} + +int +socreate(dom, aso, type, proto, p) + int dom; + struct socket **aso; + register int type; + int proto; + struct proc *p; +{ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) + return (EPROTONOSUPPORT); + if (prp->pr_type != type) + return (EPROTOTYPE); + so = soalloc(p != 0); + if (so == 0) + return (ENOBUFS); + + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); + so->so_type = type; + if (p != 0) + so->so_uid = p->p_ucred->cr_uid; + so->so_proto = prp; + error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); + if (error) { + so->so_state |= SS_NOFDREF; + sofree(so); + return (error); + } + *aso = so; + return (0); +} + +int +sobind(so, nam, p) + struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + int s = splnet(); + int error; + + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); + splx(s); + return (error); +} + +void +sodealloc(so) + struct socket *so; +{ + so->so_gencnt = ++so_gencnt; + zfreei(so->so_zone, so); +} + +int +solisten(so, backlog, p) + register struct socket *so; + int backlog; + struct proc *p; +{ + int s, error; + + s = splnet(); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); + if (error) { + splx(s); + return (error); + } + if (so->so_comp.tqh_first == NULL) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; + splx(s); + return (0); +} + +void +sofree(so) + register struct socket *so; +{ + struct socket *head = so->so_head; + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + return; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + TAILQ_REMOVE(&head->so_comp, so, so_list); + } else { + panic("sofree: not queued"); + } + head->so_qlen--; + so->so_state &= ~(SS_INCOMP|SS_COMP); + so->so_head = NULL; + } + sbrelease(&so->so_snd); + sorflush(so); + sodealloc(so); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. 
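+ *
+ * A listening socket first aborts any connections still queued on its
+ * incomplete and completed queues. With SO_LINGER set and the socket
+ * not marked non-blocking, the close sleeps (up to the linger interval)
+ * waiting for the disconnect to complete before detaching the pcb.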
+ */ +int +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + funsetown(so->so_sigio); + if (so->so_options & SO_ACCEPTCONN) { + struct socket *sp, *sonext; + + for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) { + error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger); + if (error) + break; + } + } + } +drop: + if (so->so_pcb) { + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sofree(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... + */ +int +soabort(so) + struct socket *so; +{ + + return (*so->so_proto->pr_usrreqs->pru_abort)(so); +} + +int +soaccept(so, nam) + register struct socket *so; + struct sockaddr **nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); + splx(s); + return (error); +} + +int +soconnect(so, nam, p) + register struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. + */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); + splx(s); + return (error); +} + +int +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); + splx(s); + return (error); +} + +int +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. + * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. 
+ * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. + */ +int +sosend(so, addr, uio, top, control, flags, p) + register struct socket *so; + struct sockaddr *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; + struct proc *p; +{ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. + * + * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM + * type sockets since that's an error. + */ + if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { + error = EINVAL; + goto out; + } + + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + if (p) + p->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto release; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? + ENOTCONN : EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && uio && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = ⊤ + space -= clen; + do { + if (uio == NULL) { + /* + * Data is prepackaged in "top". + */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { + if (top == 0) { + MGETHDR(m, M_WAIT, MT_DATA); + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_WAIT, MT_DATA); + mlen = MLEN; + } + if (resid >= MINCLSIZE) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; + len = min(min(mlen, resid), space); + } else { +nopages: + len = min(min(mlen, resid), space); + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. 
+ */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + space -= len; + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? + PRUS_EOF : + /* If there is more to send set PRUS_MORETOCOME */ + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, + top, addr, control, p); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = ⊤ + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. + */ +int +soreceive(so, psa, uio, mp0, controlp, flagsp) + register struct socket *so; + struct sockaddr **psa; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + register struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type = 0; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (psa) + *psa = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_WAIT, MT_DATA); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); + if (error) + goto bad; + do { + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreqs->pru_rcvd)(so, 0); + +restart: + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. 
MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. + */ + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_procp) + uio->uio_procp->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { + KASSERT(m->m_type == MT_SONAME, ("receive 1a")); + orig_resid = 0; + if (psa) + *psa = dup_sockaddr(mtod(m, struct sockaddr *), + mp0 == 0); + if (flags & MSG_PEEK) { + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (controlp) { + if (pr->pr_domain->dom_externalize && + mtod(m, struct cmsghdr *)->cmsg_type == + SCM_RIGHTS) + error = (*pr->pr_domain->dom_externalize)(m); + *controlp = m; + so->so_rcv.sb_mb = m->m_next; + m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + if (controlp) { + orig_resid = 0; + controlp = &(*controlp)->m_next; + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; + else + KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, + ("receive 3")); + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + if (error) + goto release; + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_WAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. + */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + m = so->so_rcv.sb_mb; + if (m) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +int +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + how++; + if (how & FREAD) + sorflush(so); + if (how & FWRITE) + return ((*pr->pr_usrreqs->pru_shutdown)(so)); + return (0); +} + +void +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero((caddr_t)sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb); +} + +/* + * Perhaps this routine, and sooptcopyout(), below, ought to come in + * an additional variant to handle the case where the option value needs + * to be some kind of integer, but not a specific size. + * In addition to their use here, these functions are also called by the + * protocol-level pr_ctloutput() routines. 
+ */ +int +sooptcopyin(sopt, buf, len, minlen) + struct sockopt *sopt; + void *buf; + size_t len; + size_t minlen; +{ + size_t valsize; + + /* + * If the user gives us more than we wanted, we ignore it, + * but if we don't get the minimum length the caller + * wants, we return EINVAL. On success, sopt->sopt_valsize + * is set to however much we actually retrieved. + */ + if ((valsize = sopt->sopt_valsize) < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + + if (sopt->sopt_p != 0) + return (copyin(sopt->sopt_val, buf, valsize)); + + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +int +sosetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + short val; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + error = ENOPROTOOPT; + } else { + switch (sopt->sopt_name) { + case SO_LINGER: + error = sooptcopyin(sopt, &l, sizeof l, sizeof l); + if (error) + goto bad; + + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + if (optval) + so->so_options |= sopt->sopt_name; + else + so->so_options &= ~sopt->sopt_name; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + if (optval < 1) { + error = EINVAL; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(sopt->sopt_name == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, + (u_long) optval) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + /* + * Make sure the low-water is never greater than + * the high-water. + */ + case SO_SNDLOWAT: + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? + so->so_rcv.sb_hiwat : optval; + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + error = sooptcopyin(sopt, &tv, sizeof tv, + sizeof tv); + if (error) + goto bad; + + if (tv.tv_sec > SHRT_MAX / hz - hz) { + error = EDOM; + goto bad; + } + val = tv.tv_sec * hz + tv.tv_usec / tick; + + switch (sopt->sopt_name) { + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } + } +bad: + return (error); +} + +/* Helper routine for getsockopt */ +int +sooptcopyout(sopt, buf, len) + struct sockopt *sopt; + void *buf; + size_t len; +{ + int error; + size_t valsize; + + error = 0; + + /* + * Documented get behavior is that we always return a value, + * possibly truncated to fit in the user's buffer. + * Traditional behavior is that we always tell the user + * precisely how much we copied, rather than something useful + * like the total amount we had available for her. 
+ * Note that this interface is not idempotent; the entire answer must + * generated ahead of time. + */ + valsize = min(len, sopt->sopt_valsize); + sopt->sopt_valsize = valsize; + if (sopt->sopt_val != 0) { + if (sopt->sopt_p != 0) + error = copyout(buf, sopt->sopt_val, valsize); + else + bcopy(buf, sopt->sopt_val, valsize); + } + return error; +} + +int +sogetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) { + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } else + return (ENOPROTOOPT); + } else { + switch (sopt->sopt_name) { + case SO_LINGER: + l.l_onoff = so->so_options & SO_LINGER; + l.l_linger = so->so_linger; + error = sooptcopyout(sopt, &l, sizeof l); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + optval = so->so_options & sopt->sopt_name; +integer: + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case SO_TYPE: + optval = so->so_type; + goto integer; + + case SO_ERROR: + optval = so->so_error; + so->so_error = 0; + goto integer; + + case SO_SNDBUF: + optval = so->so_snd.sb_hiwat; + goto integer; + + case SO_RCVBUF: + optval = so->so_rcv.sb_hiwat; + goto integer; + + case SO_SNDLOWAT: + optval = so->so_snd.sb_lowat; + goto integer; + + case SO_RCVLOWAT: + optval = so->so_rcv.sb_lowat; + goto integer; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + optval = (sopt->sopt_name == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + error = sooptcopyout(sopt, &tv, sizeof tv); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); + } +} + +void +sohasoutofband(so) + register struct socket *so; +{ + if (so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGURG, 0); + selwakeup(&so->so_rcv.sb_sel); +} + +int +sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p) +{ + int revents = 0; + int s = splnet(); + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + if (revents == 0) { + if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(p, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + } + } + + splx(s); + return (revents); +} diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c new file mode 100644 index 0000000..e718c62 --- /dev/null +++ b/sys/kern/uipc_socket2.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + static int rnd; + static struct timeval old_runtime; + static unsigned int cur_cnt, old_cnt; + struct timeval tv; + + getmicrouptime(&tv); + if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) { + old_runtime = tv; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 
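Note that solisten() (in uipc_socket.c above) clamps the backlog passed to listen(2) to the somaxconn limit, and the length check at the top of sonewconn() below lets the queues grow to three halves of that limit before new connections are refused. The limit is visible from userland as kern.ipc.somaxconn; a small sketch of querying it, assuming the FreeBSD sysctlbyname(3) interface is available:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <err.h>

int
main(void)
{
	size_t len;
	int maxconn;

	/* KIPC_SOMAXCONN is exported as the kern.ipc.somaxconn MIB entry. */
	len = sizeof maxconn;
	if (sysctlbyname("kern.ipc.somaxconn", &maxconn, &len, NULL, 0) < 0)
		err(1, "sysctlbyname");
	printf("listen(2) backlogs are clamped to %d\n", maxconn);
	return (0);
}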
+ */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_uid = head->so_uid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. 
+ * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. 
Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! */ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
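The record built here (an MT_SONAME mbuf, optional control mbufs, then the data) is what soreceive() later unpacks into the msg_name, msg_control and iovec fields of a recvmsg(2) call. A sketch of the consuming side for a bound datagram socket; the port is illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <err.h>

int
main(void)
{
	struct sockaddr_in from, sin;
	struct msghdr msg;
	struct iovec iov;
	char buf[2048];
	ssize_t n;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		err(1, "socket");
	memset(&sin, 0, sizeof sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(9999);		/* illustrative port */
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(s, (struct sockaddr *)&sin, sizeof sin) < 0)
		err(1, "bind");

	iov.iov_base = buf;
	iov.iov_len = sizeof buf;
	memset(&msg, 0, sizeof msg);
	msg.msg_name = &from;			/* filled from the MT_SONAME record */
	msg.msg_namelen = sizeof from;
	msg.msg_iov = &iov;			/* data mbufs are copied out here */
	msg.msg_iovlen = 1;

	if ((n = recvmsg(s, &msg, 0)) < 0)
		err(1, "recvmsg");
	printf("%ld bytes from %s\n", (long)n, inet_ntoa(from.sin_addr));
	return (0);
}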
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt && sb->sb_cc) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, ""); + diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c new file mode 100644 index 0000000..bd5149f --- /dev/null +++ b/sys/kern/uipc_syscalls.c @@ -0,0 +1,1701 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * sendfile(2) and related extensions: + * Copyright (c) 1998, David Greenman. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $Id: uipc_syscalls.c,v 1.50 1999/01/21 08:29:04 dillon Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/uio.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mount.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <machine/limits.h> + +static void sf_buf_init(void *arg); +SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) +static struct sf_buf *sf_buf_alloc(void); +static void sf_buf_ref(caddr_t addr, u_int size); +static void sf_buf_free(caddr_t addr, u_int size); + +static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags)); +static int recvit __P((struct proc *p, int s, struct msghdr *mp, + caddr_t namelenp)); + +static int accept1 __P((struct proc *p, struct accept_args *uap, int compat)); +static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, + int compat)); +static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, + int compat)); + +static SLIST_HEAD(, sf_buf) sf_freelist; +static vm_offset_t sf_base; +static struct sf_buf *sf_bufs; +static int sf_buf_alloc_want; + +/* + * System call interface to the socket abstraction. 
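One consequence of this layer worth illustrating: connect() below returns EINPROGRESS rather than sleeping when the socket is non-blocking, and completion is then observed through sopoll() and the SO_ERROR option in uipc_socket.c. A userland sketch of that pattern follows; the peer address is purely illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <poll.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <err.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	socklen_t len;
	int s, soerr;

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
		err(1, "socket");
	/* O_NONBLOCK sets SS_NBIO on the socket. */
	if (fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK) < 0)
		err(1, "fcntl");

	memset(&sin, 0, sizeof sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);			/* illustrative peer */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (connect(s, (struct sockaddr *)&sin, sizeof sin) < 0 &&
	    errno != EINPROGRESS)
		err(1, "connect");

	pfd.fd = s;
	pfd.events = POLLOUT;			/* sopoll() reports writability */
	if (poll(&pfd, 1, 5000) != 1)
		errx(1, "connect timed out");

	len = sizeof soerr;
	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &soerr, &len) < 0)
		err(1, "getsockopt");
	if (soerr != 0)
		errx(1, "connect failed with error %d", soerr);
	return (0);
}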
+ */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +int +socket(p, uap) + struct proc *p; + register struct socket_args /* { + int domain; + int type; + int protocol; + } */ *uap; +{ + struct filedesc *fdp = p->p_fd; + struct socket *so; + struct file *fp; + int fd, error; + + error = falloc(p, &fp, &fd); + if (error) + return (error); + fp->f_flag = FREAD|FWRITE; + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + error = socreate(uap->domain, &so, uap->type, uap->protocol, p); + if (error) { + fdp->fd_ofiles[fd] = 0; + ffree(fp); + } else { + fp->f_data = (caddr_t)so; + p->p_retval[0] = fd; + } + return (error); +} + +/* ARGSUSED */ +int +bind(p, uap) + struct proc *p; + register struct bind_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct file *fp; + struct sockaddr *sa; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + return (error); + error = sobind((struct socket *)fp->f_data, sa, p); + FREE(sa, M_SONAME); + return (error); +} + +/* ARGSUSED */ +int +listen(p, uap) + struct proc *p; + register struct listen_args /* { + int s; + int backlog; + } */ *uap; +{ + struct file *fp; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + return (solisten((struct socket *)fp->f_data, uap->backlog, p)); +} + +static int +accept1(p, uap, compat) + struct proc *p; + register struct accept_args /* { + int s; + caddr_t name; + int *anamelen; + } */ *uap; + int compat; +{ + struct file *fp; + struct sockaddr *sa; + int namelen, error, s; + struct socket *head, *so; + int fd; + short fflag; /* type must match fp->f_flag */ + + if (uap->name) { + error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, + sizeof (namelen)); + if(error) + return (error); + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + s = splnet(); + head = (struct socket *)fp->f_data; + if ((head->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + return (EINVAL); + } + if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { + splx(s); + return (EWOULDBLOCK); + } + while (head->so_comp.tqh_first == NULL && head->so_error == 0) { + if (head->so_state & SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; + break; + } + error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { + splx(s); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + splx(s); + return (error); + } + + /* + * At this point we know that there is at least one connection + * ready to be accepted. Remove it from the queue prior to + * allocating the file descriptor for it since falloc() may + * block allowing another process to accept the connection + * instead. + */ + so = head->so_comp.tqh_first; + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + + fflag = fp->f_flag; + error = falloc(p, &fp, &fd); + if (error) { + /* + * Probably ran out of file descriptors. Put the + * unaccepted connection back onto the queue and + * do another wakeup so some other process might + * have a chance at it. 
+ */ + TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); + head->so_qlen++; + wakeup_one(&head->so_timeo); + splx(s); + return (error); + } else + p->p_retval[0] = fd; + + so->so_state &= ~SS_COMP; + so->so_head = NULL; + if (head->so_sigio != NULL) + fsetown(fgetown(head->so_sigio), &so->so_sigio); + + fp->f_type = DTYPE_SOCKET; + fp->f_flag = fflag; + fp->f_ops = &socketops; + fp->f_data = (caddr_t)so; + sa = 0; + (void) soaccept(so, &sa); + if (sa == 0) { + namelen = 0; + if (uap->name) + goto gotnoname; + return 0; + } + if (uap->name) { + /* check sa_len before it is destroyed */ + if (namelen > sa->sa_len) + namelen = sa->sa_len; +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); + if (!error) +gotnoname: + error = copyout((caddr_t)&namelen, + (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); + } + FREE(sa, M_SONAME); + splx(s); + return (error); +} + +int +accept(p, uap) + struct proc *p; + struct accept_args *uap; +{ + + return (accept1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +oaccept(p, uap) + struct proc *p; + struct accept_args *uap; +{ + + return (accept1(p, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* ARGSUSED */ +int +connect(p, uap) + struct proc *p; + register struct connect_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int error, s; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) + return (EALREADY); + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + return (error); + error = soconnect(so, sa, p); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + FREE(sa, M_SONAME); + return (EINPROGRESS); + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + "connec", 0); + if (error) + break; + } + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + FREE(sa, M_SONAME); + if (error == ERESTART) + error = EINTR; + return (error); +} + +int +socketpair(p, uap) + struct proc *p; + register struct socketpair_args /* { + int domain; + int type; + int protocol; + int *rsv; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); + if (error) + return (error); + error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); + if (error) + goto free1; + error = falloc(p, &fp1, &fd); + if (error) + goto free2; + sv[0] = fd; + fp1->f_flag = FREAD|FWRITE; + fp1->f_type = DTYPE_SOCKET; + fp1->f_ops = &socketops; + fp1->f_data = (caddr_t)so1; + error = falloc(p, &fp2, &fd); + if (error) + goto free3; + fp2->f_flag = FREAD|FWRITE; + fp2->f_type = DTYPE_SOCKET; + fp2->f_ops = &socketops; + fp2->f_data = (caddr_t)so2; + sv[1] = fd; + error = soconnect2(so1, so2); + if (error) + goto free4; + if (uap->type == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. 
+ */ + error = soconnect2(so2, so1); + if (error) + goto free4; + } + error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); + return (error); +free4: + ffree(fp2); + fdp->fd_ofiles[sv[1]] = 0; +free3: + ffree(fp1); + fdp->fd_ofiles[sv[0]] = 0; +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); + return (error); +} + +static int +sendit(p, s, mp, flags) + register struct proc *p; + int s; + register struct msghdr *mp; + int flags; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *control; + struct sockaddr *to; + int len, error; + struct socket *so; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + error = getsock(p->p_fd, s, &fp); + if (error) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } + if (mp->msg_name) { + error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); + if (error) + return (error); + } else + to = 0; + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_WAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else + control = 0; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + so = (struct socket *)fp->f_data; + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + flags, p); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + if (error == 0) + p->p_retval[0] = len - auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_WRITE, + ktriov, p->p_retval[0], error); + FREE(ktriov, M_TEMP); + } +#endif +bad: + if (to) + FREE(to, M_SONAME); + return (error); +} + +int +sendto(p, uap) + struct proc *p; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + return (sendit(p, uap->s, &msg, uap->flags)); +} + +#ifdef COMPAT_OLDSOCK +int +osend(p, uap) + struct proc *p; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + 
aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 0; + return (sendit(p, uap->s, &msg, uap->flags)); +} + +int +osendmsg(p, uap) + struct proc *p; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(p, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +sendmsg(p, uap) + struct proc *p; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(p, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +static int +recvit(p, s, mp, namelenp) + register struct proc *p; + int s; + register struct msghdr *mp; + caddr_t namelenp; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *m, *control = 0; + caddr_t ctlbuf; + struct socket *so; + struct sockaddr *fromsa = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + error = getsock(p->p_fd, s, &fp); + if (error) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + so = (struct socket *)fp->f_data; + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, + &mp->msg_flags); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_READ, + ktriov, len - auio.uio_resid, error); + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + p->p_retval[0] = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || fromsa == 0) + len = 0; + else { +#ifndef MIN +#define MIN(a,b) ((a)>(b)?(b):(a)) +#endif + /* save sa_len before it is destroyed by MSG_COMPAT */ + len = MIN(len, fromsa->sa_len); +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + ((struct osockaddr *)fromsa)->sa_family = + fromsa->sa_family; +#endif + error = copyout(fromsa, + (caddr_t)mp->msg_name, (unsigned)len); + if (error) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. + */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + m = control; + mp->msg_controllen = 0; + ctlbuf = (caddr_t) mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + if (error = copyout((caddr_t)mtod(m, caddr_t), + ctlbuf, tocopy)) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; + } + mp->msg_controllen = ctlbuf - mp->msg_control; + } +out: + if (fromsa) + FREE(fromsa, M_SONAME); + if (control) + m_freem(control); + return (error); +} + +int +recvfrom(p, uap) + struct proc *p; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (uap->fromlenaddr) { + error = copyin((caddr_t)uap->fromlenaddr, + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr)); +} + +#ifdef COMPAT_OLDSOCK +int +orecvfrom(p, uap) + struct proc *p; + struct recvfrom_args *uap; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(p, uap)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +int +orecv(p, uap) + struct proc *p; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 
uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)0)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. + */ +int +orecvmsg(p, uap) + struct proc *p; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, + sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +recvmsg(p, uap) + struct proc *p; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(p, uap->s, &msg, (caddr_t)0); + if (!error) { + msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +/* ARGSUSED */ +int +shutdown(p, uap) + struct proc *p; + register struct shutdown_args /* { + int s; + int how; + } */ *uap; +{ + struct file *fp; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + return (soshutdown((struct socket *)fp->f_data, uap->how)); +} + +/* ARGSUSED */ +int +setsockopt(p, uap) + struct proc *p; + register struct setsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int valsize; + } */ *uap; +{ + struct file *fp; + struct sockopt sopt; + int error; + + if (uap->val == 0 && uap->valsize != 0) + return (EFAULT); + if (uap->valsize < 0) + return (EINVAL); + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = uap->valsize; + sopt.sopt_p = p; + + return (sosetopt((struct socket *)fp->f_data, &sopt)); +} + +/* ARGSUSED */ +int +getsockopt(p, uap) + struct proc *p; + register struct getsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int *avalsize; + } */ *uap; +{ + int valsize, error; + struct file 
*fp; + struct sockopt sopt; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + if (uap->val) { + error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, + sizeof (valsize)); + if (error) + return (error); + if (valsize < 0) + return (EINVAL); + } else + valsize = 0; + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ + sopt.sopt_p = p; + + error = sogetopt((struct socket *)fp->f_data, &sopt); + if (error == 0) { + valsize = sopt.sopt_valsize; + error = copyout((caddr_t)&valsize, + (caddr_t)uap->avalsize, sizeof (valsize)); + } + return (error); +} + +/* + * Get socket name. + */ +/* ARGSUSED */ +static int +getsockname1(p, uap, compat) + struct proc *p; + register struct getsockname_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int len, error; + + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) + return (error); + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) + return (error); + so = (struct socket *)fp->f_data; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + if (error == 0) +gotnothing: + error = copyout((caddr_t)&len, (caddr_t)uap->alen, + sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); + return (error); +} + +int +getsockname(p, uap) + struct proc *p; + struct getsockname_args *uap; +{ + + return (getsockname1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetsockname(p, uap) + struct proc *p; + struct getsockname_args *uap; +{ + + return (getsockname1(p, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * Get name of peer for connected socket. + */ +/* ARGSUSED */ +static int +getpeername1(p, uap, compat) + struct proc *p; + register struct getpeername_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int len, error; + + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (ENOTCONN); + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) + return (error); + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + if (error) + goto bad; +gotnothing: + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); +bad: + if (sa) FREE(sa, M_SONAME); + return (error); +} + +int +getpeername(p, uap) + struct proc *p; + struct getpeername_args *uap; +{ + + return (getpeername1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetpeername(p, uap) + struct proc *p; + struct ogetpeername_args *uap; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
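Both getsockname1() and getpeername1() above treat the length argument as value-result: the caller passes the buffer size in, the kernel truncates to MIN(len, sa->sa_len) and writes the actual length back. A typical userland call, using sockaddr_storage so any address family fits (function and variable names are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>

/* Print the local and remote address lengths of a connected socket. */
static int
show_names(int s)
{
	struct sockaddr_storage local, remote;
	socklen_t llen = sizeof(local), rlen = sizeof(remote);

	if (getsockname(s, (struct sockaddr *)&local, &llen) < 0)
		return (-1);
	if (getpeername(s, (struct sockaddr *)&remote, &rlen) < 0)
		return (-1);	/* ENOTCONN if the socket is not connected */
	printf("local %u bytes (family %d), peer %u bytes (family %d)\n",
	    (unsigned)llen, local.ss_family, (unsigned)rlen, remote.ss_family);
	return (0);
}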
*/ + return (getpeername1(p, (struct getpeername_args *)uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + if (type == MT_SONAME && (u_int)buflen <= 112) + buflen = MLEN; /* unix domain compat. hack */ + else +#endif + return (EINVAL); + } + m = m_get(M_WAIT, type); + if (m == NULL) + return (ENOBUFS); + m->m_len = buflen; + error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + if (error) + (void) m_free(m); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); + +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = buflen; + } + } + return (error); +} + +int +getsockaddr(namp, uaddr, len) + struct sockaddr **namp; + caddr_t uaddr; + size_t len; +{ + struct sockaddr *sa; + int error; + + if (len > SOCK_MAXADDRLEN) + return ENAMETOOLONG; + MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); + error = copyin(uaddr, sa, len); + if (error) { + FREE(sa, M_SONAME); + } else { +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = len; + *namp = sa; + } + return error; +} + +int +getsock(fdp, fdes, fpp) + struct filedesc *fdp; + int fdes; + struct file **fpp; +{ + register struct file *fp; + + if ((unsigned)fdes >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fdes]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_SOCKET) + return (ENOTSOCK); + *fpp = fp; + return (0); +} + +/* + * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) + * XXX - The sf_buf functions are currently private to sendfile(2), so have + * been made static, but may be useful in the future for doing zero-copy in + * other parts of the networking code. + */ +static void +sf_buf_init(void *arg) +{ + int i; + + SLIST_INIT(&sf_freelist); + sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); + sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT); + bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); + for (i = 0; i < nsfbufs; i++) { + sf_bufs[i].kva = sf_base + i * PAGE_SIZE; + SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list); + } +} + +/* + * Get an sf_buf from the freelist. Will block if none are available. + */ +static struct sf_buf * +sf_buf_alloc() +{ + struct sf_buf *sf; + int s; + + s = splimp(); + while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) { + sf_buf_alloc_want = 1; + tsleep(&sf_freelist, PVM, "sfbufa", 0); + } + SLIST_REMOVE_HEAD(&sf_freelist, free_list); + splx(s); + sf->refcnt = 1; + return (sf); +} + +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) +static void +sf_buf_ref(caddr_t addr, u_int size) +{ + struct sf_buf *sf; + + sf = dtosf(addr); + if (sf->refcnt == 0) + panic("sf_buf_ref: referencing a free sf_buf"); + sf->refcnt++; +} + +/* + * Lose a reference to an sf_buf. When none left, detach mapped page + * and release resources back to the system. + * + * Must be called at splimp. 
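getsockaddr() and sockargs() above copy a caller-supplied address into the kernel, reject anything longer than SOCK_MAXADDRLEN with ENAMETOOLONG, and then force sa_len to the length the caller passed. The practical consequence for userland is that the address-length argument, not the sun_len field, is what the kernel trusts. A sketch of building an AF_LOCAL address the conventional BSD way (the helper name is illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

/* Fill in a sockaddr_un and return the length to pass to bind/connect. */
static socklen_t
make_local_addr(struct sockaddr_un *sun, const char *path)
{
	memset(sun, 0, sizeof(*sun));
	sun->sun_family = AF_LOCAL;
	strlcpy(sun->sun_path, path, sizeof(sun->sun_path));
	sun->sun_len = SUN_LEN(sun);	/* BSD convenience; the kernel rewrites it anyway */
	return (SUN_LEN(sun));		/* offsetof(sun_path) + strlen(path) */
}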
+ */ +static void +sf_buf_free(caddr_t addr, u_int size) +{ + struct sf_buf *sf; + struct vm_page *m; + int s; + + sf = dtosf(addr); + if (sf->refcnt == 0) + panic("sf_buf_free: freeing free sf_buf"); + sf->refcnt--; + if (sf->refcnt == 0) { + pmap_qremove((vm_offset_t)addr, 1); + m = sf->m; + s = splvm(); + vm_page_unwire(m, 0); + /* + * Check for the object going away on us. This can + * happen since we don't hold a reference to it. + * If so, we're responsible for freeing the page. + */ + if (m->wire_count == 0 && m->object == NULL) + vm_page_free(m); + splx(s); + sf->m = NULL; + SLIST_INSERT_HEAD(&sf_freelist, sf, free_list); + if (sf_buf_alloc_want) { + sf_buf_alloc_want = 0; + wakeup(&sf_freelist); + } + } +} + +/* + * sendfile(2). + * int sendfile(int fd, int s, off_t offset, size_t nbytes, + * struct sf_hdtr *hdtr, off_t *sbytes, int flags) + * + * Send a file specified by 'fd' and starting at 'offset' to a socket + * specified by 's'. Send only 'nbytes' of the file or until EOF if + * nbytes == 0. Optionally add a header and/or trailer to the socket + * output. If specified, write the total number of bytes sent into *sbytes. + */ +int +sendfile(struct proc *p, struct sendfile_args *uap) +{ + struct file *fp; + struct filedesc *fdp = p->p_fd; + struct vnode *vp; + struct vm_object *obj; + struct socket *so; + struct mbuf *m; + struct sf_buf *sf; + struct vm_page *pg; + struct writev_args nuap; + struct sf_hdtr hdtr; + off_t off, xfsize, sbytes = 0; + int error = 0, s; + + /* + * Do argument checking. Must be a regular file in, stream + * type and connected socket out, positive offset. + */ + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) { + error = EBADF; + goto done; + } + if (fp->f_type != DTYPE_VNODE) { + error = EINVAL; + goto done; + } + vp = (struct vnode *)fp->f_data; + obj = vp->v_object; + if (vp->v_type != VREG || obj == NULL) { + error = EINVAL; + goto done; + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) + goto done; + so = (struct socket *)fp->f_data; + if (so->so_type != SOCK_STREAM) { + error = EINVAL; + goto done; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto done; + } + if (uap->offset < 0) { + error = EINVAL; + goto done; + } + + /* + * If specified, get the pointer to the sf_hdtr struct for + * any headers/trailers. + */ + if (uap->hdtr != NULL) { + error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); + if (error) + goto done; + /* + * Send any headers. Wimp out and use writev(2). + */ + if (hdtr.headers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.headers; + nuap.iovcnt = hdtr.hdr_cnt; + error = writev(p, &nuap); + if (error) + goto done; + sbytes += p->p_retval[0]; + } + } + + /* + * Protect against multiple writers to the socket. + */ + (void) sblock(&so->so_snd, M_WAITOK); + + /* + * Loop through the pages in the file, starting with the requested + * offset. Get a file page (do I/O if necessary), map the file page + * into an sf_buf, attach an mbuf header to the sf_buf, and queue + * it on the socket. + */ + for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { + vm_pindex_t pindex; + vm_offset_t pgoff; + + pindex = OFF_TO_IDX(off); +retry_lookup: + /* + * Calculate the amount to transfer. Not to exceed a page, + * the EOF, or the passed in nbytes. 
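The clamping that follows bounds each iteration's transfer by three limits: bytes remaining to EOF, bytes remaining in the current page (the first page may start at a nonzero page offset), and bytes remaining of the caller's nbytes request. The same arithmetic written as a standalone helper, purely for clarity (a sketch; the kernel code below does this inline):

#include <sys/types.h>

/* How much may be sent this iteration, or 0 when the loop should stop. */
static off_t
xfer_size(off_t off, off_t file_size, size_t nbytes, off_t sent, u_int page_size)
{
	off_t left = file_size - off;			/* do not read past EOF */
	u_int pgoff = (u_int)(off & (page_size - 1));	/* offset within this page */

	if (left > (off_t)page_size)
		left = page_size;
	if (left > (off_t)(page_size - pgoff))
		left = page_size - pgoff;		/* stay within one page mapping */
	if (nbytes != 0 && left > (off_t)nbytes - sent)
		left = (off_t)nbytes - sent;		/* honor the caller's limit */
	return (left > 0 ? left : 0);
}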
+ */ + xfsize = obj->un_pager.vnp.vnp_size - off; + if (xfsize > PAGE_SIZE) + xfsize = PAGE_SIZE; + pgoff = (vm_offset_t)(off & PAGE_MASK); + if (PAGE_SIZE - pgoff < xfsize) + xfsize = PAGE_SIZE - pgoff; + if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) + xfsize = uap->nbytes - sbytes; + if (xfsize <= 0) + break; + /* + * Optimize the non-blocking case by looking at the socket space + * before going to the extra work of constituting the sf_buf. + */ + if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { + if (so->so_state & SS_CANTSENDMORE) + error = EPIPE; + else + error = EAGAIN; + sbunlock(&so->so_snd); + goto done; + } + /* + * Attempt to look up the page. If the page doesn't exist or the + * part we're interested in isn't valid, then read it from disk. + * If some other part of the kernel has this page (i.e. it's busy), + * then disk I/O may be occuring on it, so wait and retry. + */ + pg = vm_page_lookup(obj, pindex); + if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy && + !vm_page_is_valid(pg, pgoff, xfsize))) { + struct uio auio; + struct iovec aiov; + int bsize; + + if (pg == NULL) { + pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); + if (pg == NULL) { + VM_WAIT; + goto retry_lookup; + } + /* + * don't just clear PG_BUSY manually - + * vm_page_alloc() should be considered opaque, + * use the VM routine provided to clear + * PG_BUSY. + */ + vm_page_wakeup(pg); + } + /* + * Ensure that our page is still around when the I/O completes. + */ + vm_page_io_start(pg); + vm_page_wire(pg); + /* + * Get the page from backing store. + */ + bsize = vp->v_mount->mnt_stat.f_iosize; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = 0; + aiov.iov_len = MAXBSIZE; + auio.uio_resid = MAXBSIZE; + auio.uio_offset = trunc_page(off); + auio.uio_segflg = UIO_NOCOPY; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); + error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), + p->p_ucred); + VOP_UNLOCK(vp, 0, p); + vm_page_flag_clear(pg, PG_ZERO); + vm_page_io_finish(pg); + if (error) { + vm_page_unwire(pg, 0); + /* + * See if anyone else might know about this page. + * If not and it is not valid, then free it. + */ + if (pg->wire_count == 0 && pg->valid == 0 && + pg->busy == 0 && !(pg->flags & PG_BUSY) && + pg->hold_count == 0) + vm_page_free(pg); + sbunlock(&so->so_snd); + goto done; + } + } else { + if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) + goto retry_lookup; + + /* + * Protect from having the page ripped out from + * beneath us. + */ + vm_page_wire(pg); + } + /* + * Allocate a kernel virtual page and insert the physical page + * into it. + */ + sf = sf_buf_alloc(); + sf->m = pg; + pmap_qenter(sf->kva, &pg, 1); + /* + * Get an mbuf header and set it up as having external storage. + */ + MGETHDR(m, M_WAIT, MT_DATA); + m->m_ext.ext_free = sf_buf_free; + m->m_ext.ext_ref = sf_buf_ref; + m->m_ext.ext_buf = (void *)sf->kva; + m->m_ext.ext_size = PAGE_SIZE; + m->m_data = (char *) sf->kva + pgoff; + m->m_flags |= M_EXT; + m->m_pkthdr.len = m->m_len = xfsize; + /* + * Add the buffer to the socket buffer chain. + */ + s = splnet(); +retry_space: + /* + * Make sure that the socket is still able to take more data. + * CANTSENDMORE being true usually means that the connection + * was closed. so_error is true when an error was sensed after + * a previous send. + * The state is checked after the page mapping and buffer + * allocation above since those operations may block and make + * any socket checks stale. 
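The syscall implemented here takes the argument order documented in the block comment earlier in this function: file descriptor in, stream socket out, optional header/trailer iovecs, optional total-bytes-out pointer. A minimal userland use, sending a whole file preceded by a header (header text and the helper name are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

/* Send all of 'fd' to connected stream socket 's', preceded by 'hdr'. */
static int
send_file_with_header(int fd, int s, char *hdr)
{
	struct iovec hiov;
	struct sf_hdtr hdtr;
	off_t sbytes = 0;

	hiov.iov_base = hdr;
	hiov.iov_len = strlen(hdr);
	memset(&hdtr, 0, sizeof(hdtr));
	hdtr.headers = &hiov;
	hdtr.hdr_cnt = 1;
	/* offset 0, nbytes 0 == send until EOF */
	return (sendfile(fd, s, 0, 0, &hdtr, &sbytes, 0));
}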
From this point forward, nothing + * blocks before the pru_send (or more accurately, any blocking + * results in a loop back to here to re-check). + */ + if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + } else { + error = so->so_error; + so->so_error = 0; + } + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + /* + * Wait for socket space to become available. We do this just + * after checking the connection state above in order to avoid + * a race condition with sbwait(). + */ + if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { + if (so->so_state & SS_NBIO) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + error = EAGAIN; + goto done; + } + error = sbwait(&so->so_snd); + /* + * An error from sbwait usually indicates that we've + * been interrupted by a signal. If we've sent anything + * then return bytes sent, otherwise return the error. + */ + if (error) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + goto retry_space; + } + error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); + splx(s); + if (error) { + sbunlock(&so->so_snd); + goto done; + } + } + sbunlock(&so->so_snd); + + /* + * Send trailers. Wimp out and use writev(2). + */ + if (uap->hdtr != NULL && hdtr.trailers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.trailers; + nuap.iovcnt = hdtr.trl_cnt; + error = writev(p, &nuap); + if (error) + goto done; + sbytes += p->p_retval[0]; + } + +done: + if (uap->sbytes != NULL) { + copyout(&sbytes, uap->sbytes, sizeof(off_t)); + } + return (error); +} diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c new file mode 100644 index 0000000..abdb71e --- /dev/null +++ b/sys/kern/uipc_usrreq.c @@ -0,0 +1,1186 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $Id: uipc_usrreq.c,v 1.38 1999/01/21 08:29:04 dillon Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/un.h> +#include <sys/unpcb.h> +#include <sys/vnode.h> + +#include <vm/vm_zone.h> + +struct vm_zone *unp_zone; +static unp_gen_t unp_gencnt; +static u_int unp_count; + +static struct unp_head unp_shead, unp_dhead; + +/* + * Unix communications domain. + * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + * lock pushdown + */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach __P((struct socket *)); +static void unp_detach __P((struct unpcb *)); +static int unp_bind __P((struct unpcb *,struct sockaddr *, struct proc *)); +static int unp_connect __P((struct socket *,struct sockaddr *, + struct proc *)); +static void unp_disconnect __P((struct unpcb *)); +static void unp_shutdown __P((struct unpcb *)); +static void unp_drop __P((struct unpcb *, int)); +static void unp_gc __P((void)); +static void unp_scan __P((struct mbuf *, void (*)(struct file *))); +static void unp_mark __P((struct file *)); +static void unp_discard __P((struct file *)); +static int unp_internalize __P((struct mbuf *, struct proc *)); + +static int +uipc_abort(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_drop(unp, ECONNABORTED); + return 0; +} + +static int +uipc_accept(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
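uipc_accept() above hands back either the bound address of the peer or sun_noname when the peer never bound or has already gone away. The corresponding userland pattern for a PF_LOCAL stream server looks like this (the socket path and backlog are illustrative; error handling is trimmed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un sun;
	int s, c;

	s = socket(PF_LOCAL, SOCK_STREAM, 0);
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/tmp/demo.sock", sizeof(sun.sun_path));
	unlink(sun.sun_path);		/* unp_bind fails with EADDRINUSE otherwise */
	if (bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) < 0 ||
	    listen(s, 5) < 0)
		return (1);
	c = accept(s, NULL, NULL);	/* peer address is optional */
	if (c >= 0) {
		write(c, "hello\n", 6);
		close(c);
	}
	close(s);
	return (0);
}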
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + } else { + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + } + return 0; +} + +static int +uipc_attach(struct socket *so, int proto, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp != 0) + return EISCONN; + return unp_attach(so); +} + +static int +uipc_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + return unp_bind(unp, nam, p); +} + +static int +uipc_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + return unp_connect(so, nam, curproc); +} + +static int +uipc_connect2(struct socket *so1, struct socket *so2) +{ + struct unpcb *unp = sotounpcb(so1); + + if (unp == 0) + return EINVAL; + + return unp_connect2(so1, so2); +} + +/* control is EOPNOTSUPP */ + +static int +uipc_detach(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + unp_detach(unp); + return 0; +} + +static int +uipc_disconnect(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_disconnect(unp); + return 0; +} + +static int +uipc_listen(struct socket *so, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0 || unp->unp_vnode == 0) + return EINVAL; + return 0; +} + +static int +uipc_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_conn && unp->unp_conn->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + return 0; +} + +static int +uipc_rcvd(struct socket *so, int flags) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + switch (so->so_type) { + case SOCK_DGRAM: + panic("uipc_rcvd DGRAM?"); + /*NOTREACHED*/ + + case SOCK_STREAM: +#define rcv (&so->so_rcv) +#define snd (&so2->so_snd) + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. 
+ */ + snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; + unp->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat += unp->unp_cc - rcv->sb_cc; + unp->unp_cc = rcv->sb_cc; + sowwakeup(so2); +#undef snd +#undef rcv + break; + + default: + panic("uipc_rcvd unknown socktype"); + } + return 0; +} + +/* pru_rcvoob is EOPNOTSUPP */ + +static int +uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + int error = 0; + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) { + error = EINVAL; + goto release; + } + if (flags & PRUS_OOB) { + error = EOPNOTSUPP; + goto release; + } + + if (control && (error = unp_internalize(control, p))) + goto release; + + switch (so->so_type) { + case SOCK_DGRAM: + { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, p); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = (struct sockaddr *)unp->unp_addr; + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: +#define rcv (&so2->so_rcv) +#define snd (&so->so_snd) + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, p); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc_send connected but no connection?"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. + */ + if (control) { + if (sbappendcontrol(rcv, m, control)) + control = 0; + } else + sbappend(rcv, m); + snd->sb_mbmax -= + rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; + unp->unp_conn->unp_cc = rcv->sb_cc; + sorwakeup(so2); + m = 0; +#undef snd +#undef rcv + break; + + default: + panic("uipc_send unknown socktype"); + } + + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
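PRUS_EOF lets the protocol deliver the final data and the half-close in one call. Userland gets the same effect on any stream socket by following the last write with shutdown() of the send side; a small sketch (the helper name is illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/* Write the final chunk of a request and signal EOF to the peer. */
static int
send_last(int s, const void *buf, size_t len)
{
	if (write(s, buf, len) != (ssize_t)len)
		return (-1);
	/* The peer's read() returns 0 once the queued data drains. */
	return (shutdown(s, SHUT_WR));
}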
+ */ + if (flags & PRUS_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } + +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return error; +} + +static int +uipc_sense(struct socket *so, struct stat *sb) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + sb->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + sb->st_blksize += so2->so_rcv.sb_cc; + } + sb->st_dev = NODEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + sb->st_ino = unp->unp_ino; + return (0); +} + +static int +uipc_shutdown(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + socantsendmore(so); + unp_shutdown(unp); + return 0; +} + +static int +uipc_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1); + return 0; +} + +struct pr_usrreqs uipc_usrreqs = { + uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect, + uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, + uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, + uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, + sosend, soreceive, sopoll +}; + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. + */ +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int +unp_attach(so) + struct socket *so; +{ + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + unp = zalloc(unp_zone); + if (unp == NULL) + return (ENOBUFS); + bzero(unp, sizeof *unp); + unp->unp_gencnt = ++unp_gencnt; + unp_count++; + LIST_INIT(&unp->unp_refs); + unp->unp_socket = so; + LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? 
&unp_dhead + : &unp_shead, unp, unp_link); + so->so_pcb = (caddr_t)unp; + return (0); +} + +static void +unp_detach(unp) + register struct unpcb *unp; +{ + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + --unp_count; + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (unp->unp_refs.lh_first) + unp_drop(unp->unp_refs.lh_first, ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). + */ + sorflush(unp->unp_socket); + unp_gc(); + } + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + zfree(unp_zone, unp); +} + +static int +unp_bind(unp, nam, p) + struct unpcb *unp; + struct sockaddr *nam; + struct proc *p; +{ + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + struct vattr vattr; + int error, namelen; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + if (unp->unp_vnode != NULL) + return (EINVAL); +#define offsetof(s, e) ((char *)&((s *)0)->e - (char *)((s *)0)) + namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); + if (namelen <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, namelen); + buf[namelen] = 0; /* null-terminate the string */ + NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + buf, p); +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EADDRINUSE); + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask); + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (error) + return (error); + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1); + VOP_UNLOCK(vp, 0, p); + return (0); +} + +static int +unp_connect(so, nam, p) + struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + register struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp2, *unp3; + int error, len; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); + if (len <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, len); + buf[len] = 0; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, p); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p); + if (error) + goto bad; + so2 = vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + 
unp3->unp_addr = (struct sockaddr_un *) + dup_sockaddr((struct sockaddr *) + unp2->unp_addr, 1); + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +int +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +static void +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + LIST_REMOVE(unp, unp_reflink); + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +void +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +static int +unp_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n; + struct unpcb *unp, **unp_list; + unp_gen_t gencnt; + struct xunpgen xug; + struct unp_head *head; + + head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = unp_count; + req->oldidx = 2 * (sizeof xug) + + (n + n/8) * sizeof(struct xunpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + gencnt = unp_gencnt; + n = unp_count; + + xug.xug_len = sizeof xug; + xug.xug_count = n; + xug.xug_gen = gencnt; + xug.xug_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xug, sizeof xug); + if (error) + return error; + + unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); + if (unp_list == 0) + return ENOMEM; + + for (unp = head->lh_first, i = 0; unp && i < n; + unp = unp->unp_link.le_next) { + if (unp->unp_gencnt <= gencnt) + unp_list[i++] = unp; + } + n = i; /* in case we lost some during malloc */ + + error = 0; + for (i = 0; i < n; i++) { + unp = unp_list[i]; + if (unp->unp_gencnt <= gencnt) { + struct xunpcb xu; + xu.xu_len = sizeof xu; + xu.xu_unpp = unp; + /* + * XXX - need more locking here to protect against + * connect/disconnect races for SMP. + */ + if (unp->unp_addr) + bcopy(unp->unp_addr, &xu.xu_addr, + unp->unp_addr->sun_len); + if (unp->unp_conn && unp->unp_conn->unp_addr) + bcopy(unp->unp_conn->unp_addr, + &xu.xu_caddr, + unp->unp_conn->unp_addr->sun_len); + bcopy(unp, &xu.xu_unp, sizeof *unp); + sotoxsocket(unp->unp_socket, &xu.xu_socket); + error = SYSCTL_OUT(req, &xu, sizeof xu); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
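A consumer of the net.local.*.pcblist sysctls is expected to compare the generation numbers in the xunpgen records at the head and tail of the returned buffer and refetch if they differ. A hedged sketch of that loop with sysctlbyname(), assuming struct xunpgen as declared in <sys/unpcb.h> (record parsing omitted; the slack in the size estimate above normally lets the second call succeed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <stdlib.h>

/* Fetch a consistent snapshot of the local-domain stream PCB list. */
static void *
fetch_unp_pcblist(size_t *lenp)
{
	struct xunpgen *head, *tail;
	void *buf = NULL, *nbuf;
	size_t len;

	for (;;) {
		if (sysctlbyname("net.local.stream.pcblist", NULL, &len,
		    NULL, 0) < 0)
			break;
		if ((nbuf = realloc(buf, len)) == NULL)
			break;
		buf = nbuf;
		if (sysctlbyname("net.local.stream.pcblist", buf, &len,
		    NULL, 0) < 0)
			break;
		head = (struct xunpgen *)buf;
		tail = (struct xunpgen *)(void *)((char *)buf + len -
		    sizeof(*tail));
		if (head->xug_gen == tail->xug_gen) {
			*lenp = len;
			return (buf);	/* consistent snapshot */
		}
		/* Generation moved while the kernel walked the list: retry. */
	}
	free(buf);
	return (NULL);
}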
+ */ + xug.xug_gen = unp_gencnt; + xug.xug_sogen = so_gencnt; + xug.xug_count = unp_count; + error = SYSCTL_OUT(req, &xug, sizeof xug); + } + free(unp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", + "List of active local datagram sockets"); +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", + "List of active local stream sockets"); + +static void +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +static void +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); + if (so->so_head) { + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + unp_count--; + so->so_pcb = (caddr_t) 0; + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + zfree(unp_zone, unp); + sofree(so); + } +} + +#ifdef notdef +void +unp_drain() +{ + +} +#endif + +int +unp_externalize(rights) + struct mbuf *rights; +{ + struct proc *p = curproc; /* XXX */ + register int i; + register struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + register struct file **rp = (struct file **)(cm + 1); + register struct file *fp; + int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); + int f; + + /* + * if the new FD's will not fit, then we free them all + */ + if (!fdavail(p, newfds)) { + for (i = 0; i < newfds; i++) { + fp = *rp; + unp_discard(fp); + *rp++ = 0; + } + return (EMSGSIZE); + } + /* + * now change each pointer to an fd in the global table to + * an integer that is the index to the local fd table entry + * that we set up to point to the global one we are transferring. + * XXX this assumes a pointer and int are the same size...! + */ + for (i = 0; i < newfds; i++) { + if (fdalloc(p, 0, &f)) + panic("unp_externalize"); + fp = *rp; + p->p_fd->fd_ofiles[f] = fp; + fp->f_msgcount--; + unp_rights--; + *(int *)rp++ = f; + } + return (0); +} + +void +unp_init(void) +{ + unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0); + if (unp_zone == 0) + panic("unp_init"); + LIST_INIT(&unp_dhead); + LIST_INIT(&unp_shead); +} + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int +unp_internalize(control, p) + struct mbuf *control; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + register struct cmsghdr *cm = mtod(control, struct cmsghdr *); + register struct file **rp; + register struct file *fp; + register int i, fd; + register struct cmsgcred *cmcred; + int oldfds; + + if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) || + cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) + return (EINVAL); + + /* + * Fill in credential information. + */ + if (cm->cmsg_type == SCM_CREDS) { + cmcred = (struct cmsgcred *)(cm + 1); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = p->p_cred->p_ruid; + cmcred->cmcred_gid = p->p_cred->p_rgid; + cmcred->cmcred_euid = p->p_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; + return(0); + } + + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + /* + * check that all the FDs passed in refer to legal OPEN files + * If not, reject the entire operation. 
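unp_internalize() above converts the integers carried in an SCM_RIGHTS control message into file table references after verifying that every one names an open descriptor. The sending side builds that message with the standard CMSG_* macros; a sketch (the one-byte payload and the helper name are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

/* Pass an open descriptor across an AF_LOCAL socket. */
static int
send_fd(int sock, int fd_to_send)
{
	struct msghdr msg;
	struct iovec iov;
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct cmsghdr *cm;
	char byte = 0;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &byte;		/* must carry at least one data byte */
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &fd_to_send, sizeof(int));
	return (sendmsg(sock, &msg, 0) < 0 ? -1 : 0);
}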
+ */ + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fd = *(int *)rp++; + if ((unsigned)fd >= fdp->fd_nfiles || + fdp->fd_ofiles[fd] == NULL) + return (EBADF); + } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. + * XXX this assumes a pointer and an int are the same size! + */ + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fp = fdp->fd_ofiles[*(int *)rp]; + *rp++ = fp; + fp->f_count++; + fp->f_msgcount++; + unp_rights++; + } + return (0); +} + +static int unp_defer, unp_gcing; + +static void +unp_gc() +{ + register struct file *fp, *nextfp; + register struct socket *so; + struct file **extra_ref, **fpp; + int nunref, i; + + if (unp_gcing) + return; + unp_gcing = 1; + unp_defer = 0; + /* + * before going through all this, set all FDs to + * be NOT defered and NOT externally accessible + */ + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) + fp->f_flag &= ~(FMARK|FDEFER); + do { + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + /* + * If the file is not open, skip it + */ + if (fp->f_count == 0) + continue; + /* + * If we already marked it as 'defer' in a + * previous pass, then try process it this time + * and un-mark it + */ + if (fp->f_flag & FDEFER) { + fp->f_flag &= ~FDEFER; + unp_defer--; + } else { + /* + * if it's not defered, then check if it's + * already marked.. if so skip it + */ + if (fp->f_flag & FMARK) + continue; + /* + * If all references are from messages + * in transit, then skip it. it's not + * externally accessible. + */ + if (fp->f_count == fp->f_msgcount) + continue; + /* + * If it got this far then it must be + * externally accessible. + */ + fp->f_flag |= FMARK; + } + /* + * either it was defered, or it is externally + * accessible and not already marked so. + * Now check if it is possibly one of OUR sockets. + */ + if (fp->f_type != DTYPE_SOCKET || + (so = (struct socket *)fp->f_data) == 0) + continue; + if (so->so_proto->pr_domain != &localdomain || + (so->so_proto->pr_flags&PR_RIGHTS) == 0) + continue; +#ifdef notdef + if (so->so_rcv.sb_flags & SB_LOCK) { + /* + * This is problematical; it's not clear + * we need to wait for the sockbuf to be + * unlocked (on a uniprocessor, at least), + * and it's also not clear what to do + * if sbwait returns an error due to receipt + * of a signal. If sbwait does return + * an error, we'll go into an infinite + * loop. Delete all of this for now. + */ + (void) sbwait(&so->so_rcv); + goto restart; + } +#endif + /* + * So, Ok, it's one of our sockets and it IS externally + * accessible (or was defered). Now we look + * to see if we hold any file descriptors in its + * message buffers. Follow those links and mark them + * as accessible too. + */ + unp_scan(so->so_rcv.sb_mb, unp_mark); + } + } while (unp_defer); + /* + * We grab an extra reference to each of the file table entries + * that are not otherwise accessible and then free the rights + * that are stored in messages on them. + * + * The bug in the orginal code is a little tricky, so I'll describe + * what's wrong with it here. + * + * It is incorrect to simply unp_discard each entry for f_msgcount + * times -- consider the case of sockets A and B that contain + * references to each other. On a last close of some other socket, + * we trigger a gc since the number of outstanding rights (unp_rights) + * is non-zero. If during the sweep phase the gc code un_discards, + * we end up doing a (full) closef on the descriptor. 
A closef on A + * results in the following chain. Closef calls soo_close, which + * calls soclose. Soclose calls first (through the switch + * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply + * returns because the previous instance had set unp_gcing, and + * we return all the way back to soclose, which marks the socket + * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush + * to free up the rights that are queued in messages on the socket A, + * i.e., the reference on B. The sorflush calls via the dom_dispose + * switch unp_dispose, which unp_scans with unp_discard. This second + * instance of unp_discard just calls closef on B. + * + * Well, a similar chain occurs on B, resulting in a sorflush on B, + * which results in another closef on A. Unfortunately, A is already + * being closed, and the descriptor has already been marked with + * SS_NOFDREF, and soclose panics at this point. + * + * Here, we first take an extra reference to each inaccessible + * descriptor. Then, we call sorflush ourself, since we know + * it is a Unix domain socket anyhow. After we destroy all the + * rights carried in messages, we do a last closef to get rid + * of our extra reference. This is the last close, and the + * unp_detach etc will shut down the socket. + * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; + fp = nextfp) { + nextfp = fp->f_list.le_next; + /* + * If it's not open, skip it + */ + if (fp->f_count == 0) + continue; + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ + if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + } + /* + * for each FD on our hit list, do the following two things + */ + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + sorflush((struct socket *)(tfp->f_data)); + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp, (struct proc *) NULL); + free((caddr_t)extra_ref, M_FILE); + unp_gcing = 0; +} + +void +unp_dispose(m) + struct mbuf *m; +{ + + if (m) + unp_scan(m, unp_discard); +} + +static void +unp_scan(m0, op) + register struct mbuf *m0; + void (*op) __P((struct file *)); +{ + register struct mbuf *m; + register struct file **rp; + register struct cmsghdr *cm; + register int i; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) + if (m->m_type == MT_CONTROL && + m->m_len >= sizeof(*cm)) { + cm = mtod(m, struct cmsghdr *); + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SCM_RIGHTS) + continue; + qfds = (cm->cmsg_len - sizeof *cm) + / sizeof (struct file *); + rp = (struct file **)(cm + 1); + for (i = 0; i < qfds; i++) + (*op)(*rp++); + break; /* XXX, but saves time */ + } + m0 = m0->m_act; + } +} + +static void +unp_mark(fp) + struct file *fp; +{ + + if (fp->f_flag & FMARK) + return; + unp_defer++; + fp->f_flag |= (FMARK|FDEFER); +} + +static void +unp_discard(fp) + struct file *fp; +{ + + fp->f_msgcount--; + unp_rights--; + (void) closef(fp, (struct proc *)NULL); +} diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c new file mode 100644 index 0000000..c1af873 --- /dev/null +++ b/sys/kern/vfs_aio.c @@ -0,0 +1,2046 @@ +/* + * Copyright (c) 1997 John S. Dyson. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + * + * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $ + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/unistd.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <miscfs/specfs/specdev.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_zone.h> +#include <sys/aio.h> +#include <sys/shm.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +static long jobrefid; + +#define JOBST_NULL 0x0 +#define JOBST_JOBQPROC 0x1 +#define JOBST_JOBQGLOBAL 0x2 +#define JOBST_JOBRUNNING 0x3 +#define JOBST_JOBFINISHED 0x4 +#define JOBST_JOBQBUF 0x5 +#define JOBST_JOBBFINISHED 0x6 + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef MAX_AIO_PROCS +#define MAX_AIO_PROCS 32 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef TARGET_AIO_PROCS +#define TARGET_AIO_PROCS 0 +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +#ifndef AIOD_TIMEOUT_DEFAULT +#define AIOD_TIMEOUT_DEFAULT (10 * hz) +#endif + +#ifndef AIOD_LIFETIME_DEFAULT +#define AIOD_LIFETIME_DEFAULT (30 * hz) +#endif + +static int max_aio_procs = MAX_AIO_PROCS; +static int num_aio_procs = 0; +static int target_aio_procs = TARGET_AIO_PROCS; +static int max_queue_count = MAX_AIO_QUEUE; +static int num_queue_count = 0; +static int num_buf_aio = 0; +static int num_aio_resv_start = 0; +static int aiod_timeout; +static int aiod_lifetime; + +static int max_aio_per_proc = MAX_AIO_PER_PROC, + max_aio_queue_per_proc=MAX_AIO_QUEUE_PER_PROC; + +static int max_buf_aio = MAX_BUF_AIO; + +SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, + CTLFLAG_RW, &max_aio_per_proc, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, + CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, + CTLFLAG_RW, &max_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, + CTLFLAG_RD, &num_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, + CTLFLAG_RD, &num_queue_count, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, + CTLFLAG_RW, &max_queue_count, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, + CTLFLAG_RW, &target_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, + CTLFLAG_RW, &max_buf_aio, 0, ""); 
+ +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, + CTLFLAG_RD, &num_buf_aio, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, + CTLFLAG_RW, &aiod_lifetime, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, + CTLFLAG_RW, &aiod_timeout, 0, ""); + + +/* + * Job queue item + */ + +#define AIOCBLIST_CANCELLED 0x1 +#define AIOCBLIST_RUNDOWN 0x4 +#define AIOCBLIST_ASYNCFREE 0x8 +#define AIOCBLIST_DONE 0x10 + +struct aiocblist { + TAILQ_ENTRY (aiocblist) list; /* List of jobs */ + TAILQ_ENTRY (aiocblist) plist; /* List of jobs for proc */ + int jobflags; + int jobstate; + int inputcharge, outputcharge; + struct buf *bp; /* buffer pointer */ + struct proc *userproc; /* User process */ + struct aioproclist *jobaioproc; /* AIO process descriptor */ + struct aio_liojob *lio; /* optional lio job */ + struct aiocb *uuaiocb; /* pointer in userspace of aiocb */ + struct aiocb uaiocb; /* Kernel I/O control block */ +}; + + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ +#define AIOP_SCHED 0x2 /* proc explicitly scheduled */ + +struct aioproclist { + int aioprocflags; /* AIO proc flags */ + TAILQ_ENTRY(aioproclist) list; /* List of processes */ + struct proc *aioproc; /* The AIO thread */ + TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ +}; + +/* + * data-structure for lio signal management + */ +struct aio_liojob { + int lioj_flags; + int lioj_buffer_count; + int lioj_buffer_finished_count; + int lioj_queue_count; + int lioj_queue_finished_count; + struct sigevent lioj_signal; /* signal on all I/O done */ + TAILQ_ENTRY (aio_liojob) lioj_list; + struct kaioinfo *lioj_ki; +}; +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ + +/* + * per process aio data structure + */ +struct kaioinfo { + int kaio_flags; /* per process kaio flags */ + int kaio_maxactive_count; /* maximum number of AIOs */ + int kaio_active_count; /* number of currently used AIOs */ + int kaio_qallowed_count; /* maxiumu size of AIO queue */ + int kaio_queue_count; /* size of AIO queue */ + int kaio_ballowed_count; /* maximum number of buffers */ + int kaio_queue_finished_count; /* number of daemon jobs finished */ + int kaio_buffer_count; /* number of physio buffers */ + int kaio_buffer_finished_count; /* count of I/O done */ + struct proc *kaio_p; /* process that uses this kaio block */ + TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ + TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ + TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ + TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ + TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ +}; + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant + event */ + + +static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; +static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ +static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ +static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ + +static void aio_init_aioinfo(struct proc *p) ; +static void aio_onceonly(void *) ; +static int aio_free_entry(struct aiocblist *aiocbe); +static void aio_process(struct aiocblist *aiocbe); +static int aio_newproc(void) ; +static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; +static void aio_physwakeup(struct buf *bp); +static int aio_fphysio(struct proc *p, 
struct aiocblist *aiocbe, int type); +static int aio_qphysio(struct proc *p, struct aiocblist *iocb); +static void aio_daemon(void *uproc); + +SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); + +static vm_zone_t kaio_zone=0, aiop_zone=0, + aiocb_zone=0, aiol_zone=0, aiolio_zone=0; + +/* + * Single AIOD vmspace shared amongst all of them + */ +struct vmspace *aiovmspace = NULL; + +/* + * Startup initialization + */ +void +aio_onceonly(void *na) +{ + TAILQ_INIT(&aio_freeproc); + TAILQ_INIT(&aio_activeproc); + TAILQ_INIT(&aio_jobs); + TAILQ_INIT(&aio_bufjobs); + TAILQ_INIT(&aio_freejobs); + kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); + aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); + aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); + aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); + aiolio_zone = zinit("AIOLIO", + AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); + aiod_timeout = AIOD_TIMEOUT_DEFAULT; + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; +} + +/* + * Init the per-process aioinfo structure. + * The aioinfo limits are set per-process for user limit (resource) management. + */ +void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + if (p->p_aioinfo == NULL) { + ki = zalloc(kaio_zone); + p->p_aioinfo = ki; + ki->kaio_flags = 0; + ki->kaio_maxactive_count = max_aio_per_proc; + ki->kaio_active_count = 0; + ki->kaio_qallowed_count = max_aio_queue_per_proc; + ki->kaio_queue_count = 0; + ki->kaio_ballowed_count = max_buf_aio; + ki->kaio_buffer_count = 0; + ki->kaio_buffer_finished_count = 0; + ki->kaio_p = p; + TAILQ_INIT(&ki->kaio_jobdone); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_bufdone); + TAILQ_INIT(&ki->kaio_bufqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + } +} + +/* + * Free a job entry. Wait for completion if it is currently + * active, but don't delay forever. If we delay, we return + * a flag that says that we have to restart the queue scan. 
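+ *
+ * A minimal sketch of the caller pattern this contract implies (it is
+ * exactly what aio_proc_rundown() below does):
+ *
+ *	restart:
+ *	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
+ *		aiocbn = TAILQ_NEXT(aiocbe, plist);
+ *		if (aio_free_entry(aiocbe))
+ *			goto restart;
+ *	}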
+ */ +int +aio_free_entry(struct aiocblist *aiocbe) +{ + struct kaioinfo *ki; + struct aioproclist *aiop; + struct aio_liojob *lj; + struct proc *p; + int error; + int s; + + if (aiocbe->jobstate == JOBST_NULL) + panic("aio_free_entry: freeing already free job"); + + p = aiocbe->userproc; + ki = p->p_aioinfo; + lj = aiocbe->lio; + if (ki == NULL) + panic("aio_free_entry: missing p->p_aioinfo"); + + if (aiocbe->jobstate == JOBST_JOBRUNNING) { + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) + return 0; + aiocbe->jobflags |= AIOCBLIST_RUNDOWN; + tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); + } + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + + if (aiocbe->bp == NULL) { + if (ki->kaio_queue_count <= 0) + panic("aio_free_entry: process queue size <= 0"); + if (num_queue_count <= 0) + panic("aio_free_entry: system wide queue size <= 0"); + + if(lj) { + lj->lioj_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_queue_finished_count--; + } + ki->kaio_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_queue_finished_count--; + num_queue_count--; + + } else { + if(lj) { + lj->lioj_buffer_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_buffer_finished_count--; + } + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_buffer_finished_count--; + ki->kaio_buffer_count--; + num_buf_aio--; + + } + + if ((ki->kaio_flags & KAIO_WAKEUP) || + (ki->kaio_flags & KAIO_RUNDOWN) && + ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + + if ( aiocbe->jobstate == JOBST_JOBQBUF) { + if ((error = aio_fphysio(p, aiocbe, 1)) != 0) + return error; + if (aiocbe->jobstate != JOBST_JOBBFINISHED) + panic("aio_free_entry: invalid physio finish-up state"); + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + } else if ( aiocbe->jobstate == JOBST_JOBQPROC) { + aiop = aiocbe->jobaioproc; + TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); + } else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + } else if ( aiocbe->jobstate == JOBST_JOBFINISHED) { + TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); + } else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) { + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + if (aiocbe->bp) { + vunmapbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); + aiocbe->bp = NULL; + } + } + if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + zfree(aiolio_zone, lj); + } + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + aiocbe->jobstate = JOBST_NULL; + return 0; +} + +/* + * Rundown the jobs for a given process. 
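+ * It first waits for any jobs the daemons are actively servicing and for
+ * outstanding physio buffers to drain, then frees the completed and the
+ * still-queued entries, discards lio control blocks that are fully
+ * drained, and finally releases the per-process kaioinfo structure.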
+ */ +void +aio_proc_rundown(struct proc *p) +{ + int s; + struct kaioinfo *ki; + struct aio_liojob *lj, *ljn; + struct aiocblist *aiocbe, *aiocbn; + + ki = p->p_aioinfo; + if (ki == NULL) + return; + + ki->kaio_flags |= LIOJ_SIGNAL_POSTED; + while ((ki->kaio_active_count > 0) || + (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) { + ki->kaio_flags |= KAIO_RUNDOWN; + if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) + break; + } + +restart1: + for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart1; + } + +restart2: + for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart2; + } + +/* + * Note the use of lots of splbio here, trying to avoid + * splbio for long chains of I/O. Probably unnecessary. + */ + +restart3: + s = splbio(); + while (TAILQ_FIRST(&ki->kaio_bufqueue)) { + ki->kaio_flags |= KAIO_WAKEUP; + tsleep (p, PRIBIO, "aioprn", 0); + splx(s); + goto restart3; + } + splx(s); + +restart4: + s = splbio(); + for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) { + splx(s); + goto restart4; + } + } + splx(s); + + for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist); + lj; + lj = ljn) { + ljn = TAILQ_NEXT(lj, lioj_list); + if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + zfree(aiolio_zone, lj); + } else { +#if defined(DIAGNOSTIC) + printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n", + lj->lioj_buffer_count, lj->lioj_buffer_finished_count, + lj->lioj_queue_count, lj->lioj_queue_finished_count); +#endif + } + } + + zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon) + */ +static struct aiocblist * +aio_selectjob(struct aioproclist *aiop) +{ + + struct aiocblist *aiocbe; + + aiocbe = TAILQ_FIRST(&aiop->jobtorun); + if (aiocbe) { + TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); + return aiocbe; + } + + for (aiocbe = TAILQ_FIRST(&aio_jobs); + aiocbe; + aiocbe = TAILQ_NEXT(aiocbe, list)) { + struct kaioinfo *ki; + struct proc *userp; + + userp = aiocbe->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < ki->kaio_maxactive_count) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + return aiocbe; + } + } + + return NULL; +} + +/* + * The AIO processing activity. This is the code that does the + * I/O request for the non-physio version of the operations. The + * normal vn operations are used, and this code should work in + * all instances for every type of file, including pipes, sockets, + * fifos, and regular files. 
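+ * Each request is issued through the descriptor's own fo_read/fo_write
+ * entry points with a uio built from the user's aiocb, so no
+ * file-type-specific handling is needed here.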
+ */ +void +aio_process(struct aiocblist *aiocbe) +{ + struct filedesc *fdp; + struct proc *userp, *mycp; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + int error; + off_t offset; + int oublock_st, oublock_end; + int inblock_st, inblock_end; + + userp = aiocbe->userproc; + cb = &aiocbe->uaiocb; + + mycp = curproc; + + fdp = mycp->p_fd; + fd = cb->aio_fildes; + fp = fdp->fd_ofiles[fd]; + + aiov.iov_base = (void *) cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = mycp; + + inblock_st = mycp->p_stats->p_ru.ru_inblock; + oublock_st = mycp->p_stats->p_ru.ru_oublock; + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred); + } else { + auio.uio_rw = UIO_WRITE; + error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred); + } + inblock_end = mycp->p_stats->p_ru.ru_inblock; + oublock_end = mycp->p_stats->p_ru.ru_oublock; + + aiocbe->inputcharge = inblock_end - inblock_st; + aiocbe->outputcharge = oublock_end - oublock_st; + + if (error) { + if (auio.uio_resid != cnt) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) + psignal(userp, SIGPIPE); + } + } + + cnt -= auio.uio_resid; + cb->_aiocb_private.error = error; + cb->_aiocb_private.status = cnt; + + return; + +} + +/* + * The AIO daemon, most of the actual work is done in aio_process, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *uproc) +{ + int s; + struct aioproclist *aiop; + struct vmspace *myvm, *aiovm; + struct proc *mycp; + + /* + * Local copies of curproc (cp) and vmspace (myvm) + */ + mycp = curproc; + myvm = mycp->p_vmspace; + + /* + * We manage to create only one VM space for all AIOD processes. + * The VM space for the first AIOD created becomes the shared VM + * space for all of them. We add an additional reference count, + * even for the first AIOD, so the address space does not go away, + * and we continue to use that original VM space even if the first + * AIOD exits. + */ + if ((aiovm = aiovmspace) == NULL) { + aiovmspace = myvm; + myvm->vm_refcnt++; + /* + * Remove userland cruft from address space. + */ + if (myvm->vm_shm) + shmexit(mycp); + pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK); + vm_map_remove(&myvm->vm_map, 0, USRSTACK); + myvm->vm_tsize = 0; + myvm->vm_dsize = 0; + myvm->vm_ssize = 0; + } else { + aiovm->vm_refcnt++; + mycp->p_vmspace = aiovm; + pmap_activate(mycp); + vmspace_free(myvm); + myvm = aiovm; + } + + if (mycp->p_textvp) { + vrele(mycp->p_textvp); + mycp->p_textvp = NULL; + } + + /* + * Allocate and ready the aio control info. There is one + * aiop structure per daemon. + */ + aiop = zalloc(aiop_zone); + aiop->aioproc = mycp; + aiop->aioprocflags |= AIOP_FREE; + TAILQ_INIT(&aiop->jobtorun); + + /* + * Place thread (lightweight process) onto the AIO free thread list + */ + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + + /* + * Make up a name for the daemon + */ + strcpy(mycp->p_comm, "aiod"); + + /* + * Get rid of our current filedescriptors. AIOD's don't need any + * filedescriptors, except as temporarily inherited from the client. 
+ * Credentials are also cloned, and made equivalent to "root." + */ + fdfree(mycp); + mycp->p_fd = NULL; + mycp->p_ucred = crcopy(mycp->p_ucred); + mycp->p_ucred->cr_uid = 0; + mycp->p_ucred->cr_ngroups = 1; + mycp->p_ucred->cr_groups[0] = 1; + + /* + * The daemon resides in its own pgrp. + */ + enterpgrp(mycp, mycp->p_pid, 1); + + /* + * Mark special process type + */ + mycp->p_flag |= P_SYSTEM|P_KTHREADP; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * creating to many daemons.) + */ + wakeup(mycp); + + while(1) { + struct proc *curcp; + struct aiocblist *aiocbe; + + /* + * curcp is the current daemon process context. + * userp is the current user process context. + */ + curcp = mycp; + + /* + * Take daemon off of free queue + */ + if (aiop->aioprocflags & AIOP_FREE) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + } + aiop->aioprocflags &= ~AIOP_SCHED; + + /* + * Check for jobs + */ + while ( aiocbe = aio_selectjob(aiop)) { + struct proc *userp; + struct aiocb *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + + cb = &aiocbe->uaiocb; + userp = aiocbe->userproc; + + aiocbe->jobstate = JOBST_JOBRUNNING; + + /* + * Connect to process address space for user program + */ + if (userp != curcp) { + struct vmspace *tmpvm; + /* + * Save the current address space that we are connected to. + */ + tmpvm = mycp->p_vmspace; + /* + * Point to the new user address space, and refer to it. + */ + mycp->p_vmspace = userp->p_vmspace; + mycp->p_vmspace->vm_refcnt++; + /* + * Activate the new mapping. + */ + pmap_activate(mycp); + /* + * If the old address space wasn't the daemons own address + * space, then we need to remove the daemon's reference from + * the other process that it was acting on behalf of. + */ + if (tmpvm != myvm) { + vmspace_free(tmpvm); + } + /* + * Disassociate from previous clients file descriptors, and + * associate to the new clients descriptors. Note that + * the daemon doesn't need to worry about its orginal + * descriptors, because they were originally freed. + */ + if (mycp->p_fd) + fdfree(mycp); + mycp->p_fd = fdshare(userp); + curcp = userp; + } + + ki = userp->p_aioinfo; + lj = aiocbe->lio; + + /* + * Account for currently active jobs + */ + ki->kaio_active_count++; + + /* + * Do the I/O function + */ + aiocbe->jobaioproc = aiop; + aio_process(aiocbe); + + /* + * decrement the active job count + */ + ki->kaio_active_count--; + + /* + * increment the completion count for wakeup/signal comparisons + */ + aiocbe->jobflags |= AIOCBLIST_DONE; + ki->kaio_queue_finished_count++; + if (lj) { + lj->lioj_queue_finished_count++; + } + if ((ki->kaio_flags & KAIO_WAKEUP) || + (ki->kaio_flags & KAIO_RUNDOWN) && + (ki->kaio_active_count == 0)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(userp); + } + + s = splbio(); + if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) && + (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) { + psignal(userp, lj->lioj_signal.sigev_signo); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + splx(s); + + aiocbe->jobstate = JOBST_JOBFINISHED; + + /* + * If the I/O request should be automatically rundown, do the + * needed cleanup. Otherwise, place the queue entry for + * the just finished I/O request into the done queue for the + * associated client. 
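+ *
+ * (From the client's side, an entry parked on the done queue is normally
+ * reaped with the POSIX calls, roughly:
+ *
+ *	const struct aiocb *list[1] = { &acb };
+ *
+ *	while (aio_error(&acb) == EINPROGRESS)
+ *		aio_suspend(list, 1, NULL);
+ *	nbytes = aio_return(&acb);
+ *
+ * where the final aio_return() is what releases the kernel entry.)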
+ */ + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + } else { + TAILQ_REMOVE(&ki->kaio_jobqueue, + aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, + aiocbe, plist); + } + + if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { + wakeup(aiocbe); + aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + psignal(userp, cb->aio_sigevent.sigev_signo); + } + } + + /* + * Disconnect from user address space + */ + if (curcp != mycp) { + struct vmspace *tmpvm; + /* + * Get the user address space to disconnect from. + */ + tmpvm = mycp->p_vmspace; + /* + * Get original address space for daemon. + */ + mycp->p_vmspace = myvm; + /* + * Activate the daemon's address space. + */ + pmap_activate(mycp); +#if defined(DIAGNOSTIC) + if (tmpvm == myvm) + printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); +#endif + /* + * remove our vmspace reference. + */ + vmspace_free(tmpvm); + /* + * disassociate from the user process's file descriptors. + */ + if (mycp->p_fd) + fdfree(mycp); + mycp->p_fd = NULL; + curcp = mycp; + } + + /* + * If we are the first to be put onto the free queue, wakeup + * anyone waiting for a daemon. + */ + TAILQ_REMOVE(&aio_activeproc, aiop, list); + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aioprocflags |= AIOP_FREE; + + /* + * If daemon is inactive for a long time, allow it to exit, thereby + * freeing resources. + */ + if (((aiop->aioprocflags & AIOP_SCHED) == 0) && + tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) { + if ((TAILQ_FIRST(&aio_jobs) == NULL) && + (TAILQ_FIRST(&aiop->jobtorun) == NULL)) { + if ((aiop->aioprocflags & AIOP_FREE) && + (num_aio_procs > target_aio_procs)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + zfree(aiop_zone, aiop); + num_aio_procs--; +#if defined(DIAGNOSTIC) + if (mycp->p_vmspace->vm_refcnt <= 1) + printf("AIOD: bad vm refcnt for exiting daemon: %d\n", + mycp->p_vmspace->vm_refcnt); +#endif + exit1(mycp, 0); + } + } + } + } +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. + * The AIO daemon modifies its environment itself. + */ +static int +aio_newproc() +{ + int error; + struct rfork_args rfa; + struct proc *p, *np; + + rfa.flags = RFPROC | RFCFDG; + + p = curproc; + if (error = rfork(p, &rfa)) + return error; + + np = pfind(p->p_retval[0]); + cpu_set_fork_handler(np, aio_daemon, p); + + /* + * Wait until daemon is started, but continue on just in case (to + * handle error conditions. + */ + error = tsleep(np, PZERO, "aiosta", aiod_timeout); + num_aio_procs++; + + return error; + +} + +/* + * Try the high-performance physio method for eligible VCHR devices. This + * routine doesn't require the use of any additional threads, and have + * overhead. 
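+ *
+ * A request only qualifies for this path when (see the checks below) the
+ * descriptor refers to a VCHR vnode whose character device also has a
+ * block major (d_bmaj), the device is not a tty, the transfer length is
+ * a multiple of DEV_BSIZE and no larger than MAXPHYS, and the process
+ * still has physio buffers left under kaio_ballowed_count.  Anything
+ * else returns -1 here and is serviced by the AIO daemons instead.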
+ */ +int +aio_qphysio(p, aiocbe) + struct proc *p; + struct aiocblist *aiocbe; +{ + int error; + struct aiocb *cb; + struct file *fp; + struct buf *bp; + int bflags; + struct vnode *vp; + struct kaioinfo *ki; + struct filedesc *fdp; + struct aio_liojob *lj; + int fd; + int majordev; + int s; + int cnt; + dev_t dev; + int rw; + d_strategy_t *fstrategy; + struct cdevsw *cdev; + struct cdevsw *bdev; + + cb = &aiocbe->uaiocb; + fdp = p->p_fd; + fd = cb->aio_fildes; + fp = fdp->fd_ofiles[fd]; + + if (fp->f_type != DTYPE_VNODE) { + return -1; + } + + vp = (struct vnode *)fp->f_data; + if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) { + return -1; + } + + if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) { + return -1; + } + + if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) { + return -1; + } + + majordev = major(vp->v_rdev); + if (majordev == NODEV) { + return -1; + } + + cdev = cdevsw[major(vp->v_rdev)]; + if (cdev == NULL) { + return -1; + } + + if (cdev->d_bmaj == -1) { + return -1; + } + bdev = cdev; + + ki = p->p_aioinfo; + if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { + return -1; + } + + cnt = cb->aio_nbytes; + if (cnt > MAXPHYS) { + return -1; + } + + dev = makedev(bdev->d_bmaj, minor(vp->v_rdev)); + + /* + * Physical I/O is charged directly to the process, so we don't have + * to fake it. + */ + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + + ki->kaio_buffer_count++; + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_count++; + } + + /* create and build a buffer header for a transfer */ + bp = (struct buf *)getpbuf(NULL); + + /* + * get a copy of the kva from the physical buffer + */ + bp->b_proc = p; + bp->b_dev = dev; + error = bp->b_error = 0; + + if (cb->aio_lio_opcode == LIO_WRITE) { + rw = 0; + bflags = B_WRITE; + } else { + rw = 1; + bflags = B_READ; + } + + bp->b_bcount = cb->aio_nbytes; + bp->b_bufsize = cb->aio_nbytes; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags; + bp->b_iodone = aio_physwakeup; + bp->b_saveaddr = bp->b_data; + bp->b_data = (void *) cb->aio_buf; + bp->b_blkno = btodb(cb->aio_offset); + + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + + s = splbio(); + aiocbe->bp = bp; + bp->b_spc = (void *)aiocbe; + TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); + aiocbe->jobstate = JOBST_JOBQBUF; + cb->_aiocb_private.status = cb->aio_nbytes; + num_buf_aio++; + fstrategy = bdev->d_strategy; + bp->b_error = 0; + + splx(s); + /* perform transfer */ + (*fstrategy)(bp); + + s = splbio(); + /* + * If we had an error invoking the request, or an error in processing + * the request before we have returned, we process it as an error + * in transfer. Note that such an I/O error is not indicated immediately, + * but is returned using the aio_error mechanism. In this case, aio_suspend + * will return immediately. 
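+ *
+ * (Sketch of what the client then sees: aio_suspend() comes back at once
+ * and the failure is picked up as
+ *
+ *	err = aio_error(&acb);
+ *
+ * which yields the errno value recorded here rather than EINPROGRESS,
+ * after which aio_return(&acb) reaps the request.)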
+ */ + if (bp->b_error || (bp->b_flags & B_ERROR)) { + struct aiocb *job = aiocbe->uuaiocb; + + aiocbe->uaiocb._aiocb_private.status = 0; + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + suword(&job->_aiocb_private.error, bp->b_error); + + ki->kaio_buffer_finished_count++; + + if (aiocbe->jobstate != JOBST_JOBBFINISHED) { + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->jobflags |= AIOCBLIST_DONE; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + } + } + splx(s); + return 0; + +doerror: + ki->kaio_buffer_count--; + if (lj) { + lj->lioj_buffer_count--; + } + aiocbe->bp = NULL; + relpbuf(bp, NULL); + return error; +} + +/* + * This waits/tests physio completion. + */ +int +aio_fphysio(p, iocb, flgwait) + struct proc *p; + struct aiocblist *iocb; + int flgwait; +{ + int s; + struct buf *bp; + int error; + + bp = iocb->bp; + + s = splbio(); + if (flgwait == 0) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } + } + + while ((bp->b_flags & B_DONE) == 0) { + if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } else { + break; + } + } + } + + /* release mapping into kernel space */ + vunmapbuf(bp); + iocb->bp = 0; + + error = 0; + /* + * check for an error + */ + if (bp->b_flags & B_ERROR) { + error = bp->b_error; + } + + relpbuf(bp, NULL); + return (error); +} + +/* + * Queue a new AIO request. Choosing either the threaded or direct physio + * VCHR technique is done in this code. + */ +static int +_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) +{ + struct filedesc *fdp; + struct file *fp; + unsigned int fd; + + int error; + int opcode; + struct aiocblist *aiocbe; + struct aioproclist *aiop; + struct kaioinfo *ki; + + if (aiocbe = TAILQ_FIRST(&aio_freejobs)) { + TAILQ_REMOVE(&aio_freejobs, aiocbe, list); + } else { + aiocbe = zalloc (aiocb_zone); + } + + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + + suword(&job->_aiocb_private.status, -1); + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.kernelinfo, -1); + + error = copyin((caddr_t)job, + (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb); + if (error) { + suword(&job->_aiocb_private.error, error); + + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + return error; + } + + /* + * Save userspace address of the job info + */ + aiocbe->uuaiocb = job; + + /* + * Get the opcode + */ + if (type != LIO_NOP) { + aiocbe->uaiocb.aio_lio_opcode = type; + } + opcode = aiocbe->uaiocb.aio_lio_opcode; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = aiocbe->uaiocb.aio_fildes; + if (fd >= fdp->fd_nfiles) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EBADF); + } + return EBADF; + } + + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || + ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EBADF); + } + return EBADF; + } + + if (aiocbe->uaiocb.aio_offset == -1LL) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EINVAL); + } + return EINVAL; + } + + error = suword(&job->_aiocb_private.kernelinfo, jobrefid); + if (error) { + TAILQ_INSERT_HEAD(&aio_freejobs, 
aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EINVAL); + } + return error; + } + + aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; + if (jobrefid == LONG_MAX) + jobrefid = 1; + else + jobrefid++; + + if (opcode == LIO_NOP) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.kernelinfo, 0); + } + return 0; + } + + if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.error, EINVAL); + } + return EINVAL; + } + + suword(&job->_aiocb_private.error, EINPROGRESS); + aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; + aiocbe->userproc = p; + aiocbe->jobflags = 0; + aiocbe->lio = lj; + ki = p->p_aioinfo; + + if ((error = aio_qphysio(p, aiocbe)) == 0) { + return 0; + } else if (error > 0) { + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = error; + suword(&job->_aiocb_private.error, error); + return error; + } + + /* + * No buffer for daemon I/O + */ + aiocbe->bp = NULL; + + ki->kaio_queue_count++; + if (lj) { + lj->lioj_queue_count++; + } + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); + aiocbe->jobstate = JOBST_JOBQGLOBAL; + + num_queue_count++; + error = 0; + + /* + * If we don't have a free AIO process, and we are below our + * quota, then start one. Otherwise, depend on the subsequent + * I/O completions to pick-up this job. If we don't sucessfully + * create the new process (thread) due to resource issues, we + * return an error for now (EAGAIN), which is likely not the + * correct thing to do. + */ +retryproc: + if (aiop = TAILQ_FIRST(&aio_freeproc)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && + ((ki->kaio_active_count + num_aio_resv_start) < + ki->kaio_maxactive_count)) { + num_aio_resv_start++; + if ((error = aio_newproc()) == 0) { + num_aio_resv_start--; + p->p_retval[0] = 0; + goto retryproc; + } + num_aio_resv_start--; + } + return error; +} + +/* + * This routine queues an AIO request, checking for quotas. + */ +static int +aio_aqueue(struct proc *p, struct aiocb *job, int type) +{ + struct kaioinfo *ki; + + if (p->p_aioinfo == NULL) { + aio_init_aioinfo(p); + } + + if (num_queue_count >= max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if (ki->kaio_queue_count >= ki->kaio_qallowed_count) + return EAGAIN; + + return _aio_aqueue(p, job, NULL, type); +} + +/* + * Support the aio_return system call, as a side-effect, kernel + * resources are released. 
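+ * Once a job has been reaped here its entry is freed via aio_free_entry(),
+ * so a second aio_return() on the same control block no longer finds a
+ * matching kernelinfo id and fails with EINVAL.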
+ */ +int +aio_return(struct proc *p, struct aio_return_args *uap) +{ + int s; + int jobref; + struct aiocblist *cb, *ncb; + struct aiocb *ujob; + struct kaioinfo *ki; + + ki = p->p_aioinfo; + if (ki == NULL) { + return EINVAL; + } + + ujob = uap->aiocbp; + + jobref = fuword(&ujob->_aiocb_private.kernelinfo); + if (jobref == -1 || jobref == 0) + return EINVAL; + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + if (ujob == cb->uuaiocb) { + p->p_retval[0] = cb->uaiocb._aiocb_private.status; + } else { + p->p_retval[0] = EFAULT; + } + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + curproc->p_stats->p_ru.ru_oublock += cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + curproc->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + aio_free_entry(cb); + return 0; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = ncb) { + ncb = TAILQ_NEXT(cb, plist); + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + splx(s); + if (ujob == cb->uuaiocb) { + p->p_retval[0] = cb->uaiocb._aiocb_private.status; + } else { + p->p_retval[0] = EFAULT; + } + aio_free_entry(cb); + return 0; + } + } + splx(s); + + return (EINVAL); +} + +/* + * Allow a process to wakeup when any of the I/O requests are + * completed. + */ +int +aio_suspend(struct proc *p, struct aio_suspend_args *uap) +{ + struct timeval atv; + struct timespec ts; + struct aiocb *const *cbptr, *cbp; + struct kaioinfo *ki; + struct aiocblist *cb; + int i; + int njoblist; + int error, s, timo; + int *ijoblist; + struct aiocb **ujoblist; + + if (uap->nent >= AIO_LISTIO_MAX) + return EINVAL; + + timo = 0; + if (uap->timeout) { + /* + * Get timespec struct + */ + if (error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) { + return error; + } + + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + njoblist = 0; + ijoblist = zalloc(aiol_zone); + ujoblist = zalloc(aiol_zone); + cbptr = uap->aiocbp; + + for(i = 0; i < uap->nent; i++) { + cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (cbp == 0) + continue; + ujoblist[njoblist] = cbp; + ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); + njoblist++; + } + if (njoblist == 0) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return 0; + } + + error = 0; + while (1) { + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; cb = TAILQ_NEXT(cb, plist)) { + for(i = 0; i < njoblist; i++) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return error; + } + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; cb = TAILQ_NEXT(cb, plist)) { + for(i = 0; i < njoblist; i++) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + splx(s); + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return error; + } + } + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo); + splx(s); + + if (error == EINTR) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return EINTR; + } else if (error == 
EWOULDBLOCK) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return EAGAIN; + } + } + +/* NOTREACHED */ + return EINVAL; +} + +/* + * aio_cancel at the kernel level is a NOOP right now. It + * might be possible to support it partially in user mode, or + * in kernel mode later on. + */ +int +aio_cancel(struct proc *p, struct aio_cancel_args *uap) +{ + return ENOSYS; +} + +/* + * aio_error is implemented in the kernel level for compatibility + * purposes only. For a user mode async implementation, it would be + * best to do it in a userland subroutine. + */ +int +aio_error(struct proc *p, struct aio_error_args *uap) +{ + int s; + struct aiocblist *cb; + struct kaioinfo *ki; + int jobref; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + + jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); + if ((jobref == -1) || (jobref == 0)) + return EINVAL; + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = cb->uaiocb._aiocb_private.error; + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); + cb; + cb = TAILQ_NEXT(cb, plist)) { + + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = EINPROGRESS; + return 0; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = cb->uaiocb._aiocb_private.error; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + + + /* + * Hack for lio + */ +/* + status = fuword(&uap->aiocbp->_aiocb_private.status); + if (status == -1) { + return fuword(&uap->aiocbp->_aiocb_private.error); + } +*/ + return EINVAL; +} + +int +aio_read(struct proc *p, struct aio_read_args *uap) +{ + struct filedesc *fdp; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + struct aiocb iocb; + int error, pmodes; + + pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); + if ((pmodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); + } + + /* + * Get control block + */ + if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) + return error; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = iocb.aio_fildes; + if (fd >= fdp->fd_nfiles) + return EBADF; + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) + return EBADF; + if (iocb.aio_offset == -1LL) + return EINVAL; + + auio.uio_resid = iocb.aio_nbytes; + if (auio.uio_resid < 0) + return (EINVAL); + + /* + * Process sync simply -- queue async request. 
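+ * (That is, a request without AIO_PMODE_SYNC set is handed to aio_aqueue()
+ * and serviced asynchronously; only the synchronous case falls through to
+ * the in-line read below.)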
+ */ + if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); + } + + aiov.iov_base = (void *) iocb.aio_buf; + aiov.iov_len = iocb.aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = iocb.aio_offset; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + + cnt = iocb.aio_nbytes; + error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred); + if (error && + (auio.uio_resid != cnt) && + (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; + p->p_retval[0] = cnt; + return error; +} + +int +aio_write(struct proc *p, struct aio_write_args *uap) +{ + struct filedesc *fdp; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + struct aiocb iocb; + int error; + int pmodes; + + /* + * Process sync simply -- queue async request. + */ + pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); + if ((pmodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE); + } + + if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) + return error; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = iocb.aio_fildes; + if (fd >= fdp->fd_nfiles) + return EBADF; + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) + return EBADF; + if (iocb.aio_offset == -1LL) + return EINVAL; + + aiov.iov_base = (void *) iocb.aio_buf; + aiov.iov_len = iocb.aio_nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = iocb.aio_offset; + + auio.uio_resid = iocb.aio_nbytes; + if (auio.uio_resid < 0) + return (EINVAL); + + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + + cnt = iocb.aio_nbytes; + error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred); + if (error) { + if (auio.uio_resid != cnt) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + } + cnt -= auio.uio_resid; + p->p_retval[0] = cnt; + return error; +} + +int +lio_listio(struct proc *p, struct lio_listio_args *uap) +{ + int nent, nentqueued; + struct aiocb *iocb, * const *cbptr; + struct aiocblist *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + int error, runningcode; + int nerror; + int i; + int s; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) { + return EINVAL; + } + + nent = uap->nent; + if (nent > AIO_LISTIO_MAX) { + return EINVAL; + } + + if (p->p_aioinfo == NULL) { + aio_init_aioinfo(p); + } + + if ((nent + num_queue_count) > max_queue_count) { + return EAGAIN; + } + + ki = p->p_aioinfo; + if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) { + return EAGAIN; + } + + lj = zalloc(aiolio_zone); + if (!lj) { + return EAGAIN; + } + + lj->lioj_flags = 0; + lj->lioj_buffer_count = 0; + lj->lioj_buffer_finished_count = 0; + lj->lioj_queue_count = 0; + lj->lioj_queue_finished_count = 0; + lj->lioj_ki = ki; + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + + /* + * Setup signal + */ + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal); + if (error) + return error; + lj->lioj_flags |= LIOJ_SIGNAL; + lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; + } else { + lj->lioj_flags &= ~LIOJ_SIGNAL; + } + +/* + * get pointers to the list of I/O requests + */ + + nerror = 0; + nentqueued = 0; + cbptr = 
uap->acb_list; + for(i = 0; i < uap->nent; i++) { + iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) { + error = _aio_aqueue(p, iocb, lj, 0); + if (error == 0) { + nentqueued++; + } else { + nerror++; + } + } + } + + /* + * If we haven't queued any, then just return error + */ + if (nentqueued == 0) { + return 0; + } + + /* + * Calculate the appropriate error return + */ + runningcode = 0; + if (nerror) + runningcode = EIO; + + if (uap->mode == LIO_WAIT) { + while (1) { + int found; + found = 0; + for(i = 0; i < uap->nent; i++) { + int jobref, command; + + /* + * Fetch address of the control buf pointer in user space + */ + iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0)) + continue; + + /* + * Fetch the associated command from user space + */ + command = fuword(&iocb->aio_lio_opcode); + if (command == LIO_NOP) { + found++; + continue; + } + + jobref = fuword(&iocb->_aiocb_private.kernelinfo); + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + curproc->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + curproc->p_stats->p_ru.ru_inblock += + cb->inputcharge; + cb->inputcharge = 0; + } + found++; + break; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + found++; + break; + } + } + splx(s); + + } + + /* + * If all I/Os have been disposed of, then we can return + */ + if (found == nentqueued) { + return runningcode; + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0); + + if (error == EINTR) { + return EINTR; + } else if (error == EWOULDBLOCK) { + return EAGAIN; + } + + } + } + + return runningcode; +} + +/* + * This is a wierd hack so that we can post a signal. It is safe + * to do so from a timeout routine, but *not* from an interrupt routine. + */ +static void +process_signal(void *ljarg) +{ + struct aio_liojob *lj = ljarg; + if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) { + if (lj->lioj_queue_count == lj->lioj_queue_finished_count) { + psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } +} + +/* + * Interrupt handler for physio, performs the necessary process wakeups, + * and signals. 
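+ * This is called from the buffer's b_iodone hook at interrupt time, which
+ * is why a pending lio completion signal is not posted directly: psignal()
+ * is only safe from a timeout routine in this context, so process_signal()
+ * above is scheduled via timeout() instead.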
+ */ +static void +aio_physwakeup(bp) + struct buf *bp; +{ + struct aiocblist *aiocbe; + struct proc *p; + struct kaioinfo *ki; + struct aio_liojob *lj; + int s; + s = splbio(); + + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; + bp->b_flags |= B_DONE; + + aiocbe = (struct aiocblist *)bp->b_spc; + if (aiocbe) { + p = bp->b_proc; + + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; + aiocbe->uaiocb._aiocb_private.error = 0; + aiocbe->jobflags |= AIOCBLIST_DONE; + + if (bp->b_flags & B_ERROR) { + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + } + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_finished_count++; + /* + * wakeup/signal if all of the interrupt jobs are done + */ + if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) { + /* + * post a signal if it is called for + */ + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + timeout(process_signal, lj, 0); + } + } + } + + ki = p->p_aioinfo; + if (ki) { + ki->kaio_buffer_finished_count++; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + /* + * and do the wakeup + */ + if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + } + } + splx(s); +} diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c new file mode 100644 index 0000000..3664ccd --- /dev/null +++ b/sys/kern/vfs_bio.c @@ -0,0 +1,2443 @@ +/* + * Copyright (c) 1994,1997 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * + * $Id: vfs_bio.c,v 1.194 1999/01/21 08:29:05 dillon Exp $ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. 
+ */ + +#define VMIO +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/vmmeter.h> +#include <sys/lock.h> +#include <miscfs/specfs/specdev.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <sys/buf.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> + +static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +#if 0 /* replaced bu sched_sync */ +static void vfs_update __P((void)); +static struct proc *updateproc; +static struct kproc_desc up_kp = { + "update", + vfs_update, + &updateproc +}; +SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) +#endif + +struct buf *buf; /* buffer header pool */ +struct swqueue bswlist; + +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff, + vm_offset_t off, vm_offset_t size, + vm_page_t m); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, + int pageno, vm_page_t m); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); +static void flushdirtybuffers(int slpflag, int slptimeo); + +int needsbuffer; + +/* + * Internal update daemon, process 3 + * The variable vfs_update_wakeup allows for internal syncs. + */ +int vfs_update_wakeup; + + +/* + * buffers base kva + */ + +/* + * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. 
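+ * (Roughly: when a read is issued on a buffer some of whose pages are
+ * already valid, those pages are swapped for bogus_page around the device
+ * transfer so the driver cannot clobber good data, and the real pages are
+ * put back when the I/O completes.)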
+ */ +vm_page_t bogus_page; +static vm_offset_t bogus_offset; + +static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, + bufmallocspace, maxbufmallocspace; +int numdirtybuffers; +static int lodirtybuffers, hidirtybuffers; +static int numfreebuffers, lofreebuffers, hifreebuffers; +static int kvafreespace; + +SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, + &numdirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, + &lodirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, + &hidirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, + &numfreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, + &lofreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, + &hifreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, + &maxbufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, + &bufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, + &maxvmiobufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD, + &vmiospace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, + &maxbufmallocspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, + &bufmallocspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, + &kvafreespace, 0, ""); + +static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; +struct bqueues bufqueues[BUFFER_QUEUES] = {0}; + +extern int vm_swap_size; + +#define BUF_MAXUSE 24 + +#define VFS_BIO_NEED_ANY 1 +#define VFS_BIO_NEED_LOWLIMIT 2 +#define VFS_BIO_NEED_FREE 4 + +/* + * Initialize buffer headers and related structures. + */ +void +bufinit() +{ + struct buf *bp; + int i; + + TAILQ_INIT(&bswlist); + LIST_INIT(&invalhash); + + /* first, make a null hash table */ + for (i = 0; i < BUFHSZ; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; + bp->b_xflags = 0; + LIST_INIT(&bp->b_dep); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + } +/* + * maxbufspace is currently calculated to support all filesystem blocks + * to be 8K. If you happen to use a 16K filesystem, the size of the buffer + * cache is still the same as it would be for 8K filesystems. This + * keeps the size of the buffer cache "in check" for big block filesystems. + */ + maxbufspace = (nbuf + 8) * DFLTBSIZE; +/* + * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed + */ + maxvmiobufspace = 2 * maxbufspace / 3; +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = maxbufspace / 20; + +/* + * Remove the probability of deadlock conditions by limiting the + * number of dirty buffers. 
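+ * For example, with the formulas below and nbuf = 1024, delayed writes
+ * start triggering flushdirtybuffers() at hidirtybuffers = 148 dirty
+ * buffers, with lodirtybuffers = 74 as the corresponding low-water mark.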
+ */ + hidirtybuffers = nbuf / 8 + 20; + lodirtybuffers = nbuf / 16 + 10; + numdirtybuffers = 0; + lofreebuffers = nbuf / 18 + 5; + hifreebuffers = 2 * lofreebuffers; + numfreebuffers = nbuf; + kvafreespace = 0; + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + +} + +/* + * Free the kva allocation for a buffer + * Must be called only at splbio or higher, + * as this is the only locking for buffer_map. + */ +static void +bfreekva(struct buf * bp) +{ + if (bp->b_kvasize == 0) + return; + + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize); + + bp->b_kvasize = 0; + +} + +/* + * remove the buffer from the appropriate free list + */ +void +bremfree(struct buf * bp) +{ + int s = splbio(); + + if (bp->b_qindex != QUEUE_NONE) { + if (bp->b_qindex == QUEUE_EMPTY) { + kvafreespace -= bp->b_kvasize; + } + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { +#if !defined(MAX_PERF) + panic("bremfree: removing a buffer when not on a queue"); +#endif + } + if ((bp->b_flags & B_INVAL) || + (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0) + --numfreebuffers; + splx(s); +} + + +/* + * Get a buffer with the specified data. Look in the cache first. + */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) +{ + struct buf *bp; + + bp = getblk(vp, blkno, size, 0, 0); + *bpp = bp; + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + return (biowait(bp)); + } + return (0); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + ++readwait; + } + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_READ | B_ASYNC; + rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (rabp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + rabp->b_rcred = cred; + } + vfs_busy_pages(rabp, 0); + VOP_STRATEGY(vp, rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = biowait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async.) 
+ */ +int +bwrite(struct buf * bp) +{ + int oldflags, s; + struct vnode *vp; + struct mount *mp; + + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + oldflags = bp->b_flags; + +#if !defined(MAX_PERF) + if ((bp->b_flags & B_BUSY) == 0) + panic("bwrite: buffer is not busy???"); +#endif + + bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + bp->b_flags |= B_WRITEINPROG; + + s = splbio(); + if ((oldflags & B_DELWRI) == B_DELWRI) { + --numdirtybuffers; + reassignbuf(bp, bp->b_vp); + } + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + if (curproc != NULL) + curproc->p_stats->p_ru.ru_oublock++; + splx(s); + VOP_STRATEGY(bp->b_vp, bp); + + /* + * Collect statistics on synchronous and asynchronous writes. + * Writes to block devices are charged to their associated + * filesystem (if any). + */ + if ((vp = bp->b_vp) != NULL) { + if (vp->v_type == VBLK) + mp = vp->v_specmountpoint; + else + mp = vp->v_mount; + if (mp != NULL) + if ((oldflags & B_ASYNC) == 0) + mp->mnt_stat.f_syncwrites++; + else + mp->mnt_stat.f_asyncwrites++; + } + + if ((oldflags & B_ASYNC) == 0) { + int rtval = biowait(bp); + brelse(bp); + return (rtval); + } + return (0); +} + +void +vfs_bio_need_satisfy(void) { + ++numfreebuffers; + if (!needsbuffer) + return; + if (numdirtybuffers < lodirtybuffers) { + needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT); + } else { + needsbuffer &= ~VFS_BIO_NEED_ANY; + } + if (numfreebuffers >= hifreebuffers) { + needsbuffer &= ~VFS_BIO_NEED_FREE; + } + wakeup(&needsbuffer); +} + +/* + * Delayed write. (Buffer is marked dirty). + */ +void +bdwrite(struct buf * bp) +{ + struct vnode *vp; + +#if !defined(MAX_PERF) + if ((bp->b_flags & B_BUSY) == 0) { + panic("bdwrite: buffer is not busy"); + } +#endif + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + bp->b_flags &= ~(B_READ|B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + } + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); + + /* + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. + */ + vfs_clean_pages(bp); + bqrelse(bp); + + /* + * XXX The soft dependency code is not prepared to + * have I/O done when a bdwrite is requested. For + * now we just let the write be delayed if it is + * requested by the soft dependency code. 
+ */ + if ((vp = bp->b_vp) && + ((vp->v_type == VBLK && vp->v_specmountpoint && + (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) + return; + + if (numdirtybuffers >= hidirtybuffers) + flushdirtybuffers(0, 0); + + return; +} + + +/* + * Same as first half of bdwrite, mark buffer dirty, but do not release it. + * Check how this compares with vfs_setdirty(); XXX [JRE] + */ +void +bdirty(bp) + struct buf *bp; +{ + + bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */ + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */ + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + } +} + +/* + * Asynchronous write. + * Start output on a buffer, but do not wait for it to complete. + * The buffer is released when the output completes. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) VOP_BWRITE(bp); +} + +/* + * Ordered write. + * Start output on a buffer, and flag it so that the device will write + * it in the order it was queued. The buffer is released when the output + * completes. + */ +int +bowrite(struct buf * bp) +{ + bp->b_flags |= B_ORDERED|B_ASYNC; + return (VOP_BWRITE(bp)); +} + +/* + * Release a buffer. + */ +void +brelse(struct buf * bp) +{ + int s; + + if (bp->b_flags & B_CLUSTER) { + relpbuf(bp, NULL); + return; + } + + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_flags & B_LOCKED) + bp->b_flags &= ~B_ERROR; + + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || + (bp->b_bufsize <= 0)) { + bp->b_flags |= B_INVAL; + if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + if (bp->b_flags & B_DELWRI) + --numdirtybuffers; + bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); + if ((bp->b_flags & B_VMIO) == 0) { + if (bp->b_bufsize) + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_release(), even + * if B_DELWRI is set. + */ + + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, so the B_INVAL flag is used to *invalidate* the buffer, + * but the VM object is kept around. The B_NOCACHE flag is used to + * invalidate the pages in the VM object. + * + * The b_{validoff,validend,dirtyoff,dirtyend} values are relative + * to b_offset and currently have byte granularity, whereas the + * valid flags in the vm_pages have only DEV_BSIZE resolution. + * The byte resolution fields are used to avoid unnecessary re-reads + * of the buffer but the code really needs to be genericized so + * other filesystem modules can take advantage of these fields. + * + * XXX this seems to cause performance problems. 
+ */ + if ((bp->b_flags & B_VMIO) + && !(bp->b_vp->v_tag == VT_NFS && + bp->b_vp->v_type != VBLK && + (bp->b_flags & B_DELWRI) != 0) +#ifdef notdef + && (bp->b_vp->v_tag != VT_NFS + || bp->b_vp->v_type == VBLK + || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) + || bp->b_validend == 0 + || (bp->b_validoff == 0 + && bp->b_validend == bp->b_bufsize)) +#endif + ) { + + int i, j, resid; + vm_page_t m; + off_t foff; + vm_pindex_t poff; + vm_object_t obj; + struct vnode *vp; + + vp = bp->b_vp; + + /* + * Get the base offset and length of the buffer. Note that + * for block sizes that are less then PAGE_SIZE, the b_data + * base of the buffer does not represent exactly b_offset and + * neither b_offset nor b_size are necessarily page aligned. + * Instead, the starting position of b_offset is: + * + * b_data + (b_offset & PAGE_MASK) + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). + * + * See man buf(9) for more information + */ + + resid = bp->b_bufsize; + foff = bp->b_offset; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + vm_page_flag_clear(m, PG_ZERO); + if (m == bogus_page) { + + obj = (vm_object_t) vp->v_object; + poff = OFF_TO_IDX(bp->b_offset); + + for (j = i; j < bp->b_npages; j++) { + m = bp->b_pages[j]; + if (m == bogus_page) { + m = vm_page_lookup(obj, poff + j); +#if !defined(MAX_PERF) + if (!m) { + panic("brelse: page missing\n"); + } +#endif + bp->b_pages[j] = m; + } + } + + if ((bp->b_flags & B_INVAL) == 0) { + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + } + if (bp->b_flags & (B_NOCACHE|B_ERROR)) { + int poffset = foff & PAGE_MASK; + int presid = resid > (PAGE_SIZE - poffset) ? + (PAGE_SIZE - poffset) : resid; + + KASSERT(presid >= 0, ("brelse: extra page")); + vm_page_set_invalid(m, poffset, presid); + } + resid -= PAGE_SIZE - (foff & PAGE_MASK); + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } else if (bp->b_flags & B_VMIO) { + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } + +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); +#endif + + /* enqueue */ + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_flags |= B_INVAL; + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + kvafreespace += bp->b_kvasize; + + /* buffers with junk contents */ + } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { + bp->b_flags |= B_INVAL; + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_AGE) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); + + /* buffers with valid and quite potentially reuseable contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + if ((bp->b_flags & B_INVAL) || + (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { + if (bp->b_flags & B_DELWRI) { + 
--numdirtybuffers; + bp->b_flags &= ~B_DELWRI; + } + vfs_bio_need_satisfy(); + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +/* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); +#endif + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { + vfs_bio_need_satisfy(); + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i, s; + vm_page_t m; + + s = splvm(); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + /* + * In order to keep page LRU ordering consistent, put + * everything on the inactive queue. + */ + vm_page_unwire(m, 0); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + vm_page_flag_clear(m, PG_ZERO); + /* + * Might as well free the page if we can and it has + * no valid data. + */ + if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } + } + splx(s); + bufspace -= bp->b_bufsize; + vmiospace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +gbincore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + struct bufhashhdr *bh; + + bh = BUFHASH(vp, blkno); + bp = bh->lh_first; + + /* Search hash chain */ + while (bp != NULL) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + bp = bp->b_hash.le_next; + } + return (bp); +} + +/* + * this routine implements clustered async writes for + * clearing out B_DELWRI buffers... This is much better + * than the old way of writing only one buffer at a time. 
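A worked example of the contiguity test the clustering code below applies: logical block lbn+i may join the cluster only when its disk address sits exactly i filesystem blocks past the first buffer's disk address. A minimal user-space sketch, assuming 512-byte device blocks (DEV_BSHIFT of 9) and an 8K f_iosize, both illustrative:

#include <stdio.h>

#define DEV_BSHIFT      9               /* assumption: 512-byte device blocks */

static int
contiguous(long bp_blkno, long bpa_blkno, int i, int size)
{
        /* i filesystem blocks is (i * size) >> DEV_BSHIFT device blocks */
        return (bpa_blkno == bp_blkno + ((long)(i * size) >> DEV_BSHIFT));
}

int
main(void)
{
        int size = 8192;                /* illustrative f_iosize */

        printf("%d\n", contiguous(100, 116, 1, size));  /* 1: adjacent on disk */
        printf("%d\n", contiguous(100, 120, 1, size));  /* 0: hole, stop the cluster */
        return (0);
}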
+ */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + int size; + int maxcl; + + s = splbio(); + /* + * right now we support clustered writing only to regular files + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + ncl = i; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno, ncl); + splx(s); + return nwritten; + } + } + + bremfree(bp); + bp->b_flags |= B_BUSY | B_ASYNC; + + splx(s); + /* + * default (old) behavior, writing out only one block + */ + nwritten = bp->b_bufsize; + (void) VOP_BWRITE(bp); + return nwritten; +} + + +/* + * Find a buffer header which is available for use. + */ +static struct buf * +getnewbuf(struct vnode *vp, daddr_t blkno, + int slpflag, int slptimeo, int size, int maxsize) +{ + struct buf *bp, *bp1; + int nbyteswritten = 0; + vm_offset_t addr; + static int writerecursion = 0; + +start: + if (bufspace >= maxbufspace) + goto trytofreespace; + + /* can we constitute a new buffer? */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_EMPTY) + panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", + bp->b_qindex); +#endif + bp->b_flags |= B_BUSY; + bremfree(bp); + goto fillbuf; + } +trytofreespace: + /* + * We keep the file I/O from hogging metadata I/O + * This is desirable because file data is cached in the + * VM/Buffer cache even if a buffer is freed. + */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_AGE) + panic("getnewbuf: inconsistent AGE queue, qindex=%d", + bp->b_qindex); +#endif + } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_LRU) + panic("getnewbuf: inconsistent LRU queue, qindex=%d", + bp->b_qindex); +#endif + } + if (!bp) { + /* wait for a free buffer of any kind */ + needsbuffer |= VFS_BIO_NEED_ANY; + do + tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", + slptimeo); + while (needsbuffer & VFS_BIO_NEED_ANY); + return (0); + } + KASSERT(!(bp->b_flags & B_BUSY), + ("getnewbuf: busy buffer on free list\n")); + /* + * We are fairly aggressive about freeing VMIO buffers, but since + * the buffering is intact without buffer headers, there is not + * much loss. We gain by maintaining non-VMIOed metadata in buffers. 
+ */ + if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { + if ((bp->b_flags & B_VMIO) == 0 || + (vmiospace < maxvmiobufspace)) { + --bp->b_usecount; + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + + + /* if we are a delayed write, convert to an async write */ + if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { + + /* + * If our delayed write is likely to be used soon, then + * recycle back onto the LRU queue. + */ + if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) && + (bp->b_lblkno >= blkno) && (maxsize > 0)) { + + if (bp->b_usecount > 0) { + if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) { + + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + bp->b_usecount--; + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + } + + /* + * Certain layered filesystems can recursively re-enter the vfs_bio + * code, due to delayed writes. This helps keep the system from + * deadlocking. + */ + if (writerecursion > 0) { + if (writerecursion > 5) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + while (bp) { + if ((bp->b_flags & B_DELWRI) == 0) + break; + bp = TAILQ_NEXT(bp, b_freelist); + } + if (bp == NULL) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + while (bp) { + if ((bp->b_flags & B_DELWRI) == 0) + break; + bp = TAILQ_NEXT(bp, b_freelist); + } + } + if (bp == NULL) + panic("getnewbuf: cannot get buffer, infinite recursion failure"); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY | B_AGE | B_ASYNC; + nbyteswritten += bp->b_bufsize; + ++writerecursion; + VOP_BWRITE(bp); + --writerecursion; + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + } else { + ++writerecursion; + nbyteswritten += vfs_bio_awrite(bp); + --writerecursion; + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + } + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + bremfree(bp); + bp->b_flags |= B_BUSY; + + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + + if (bp->b_vp) + brelvp(bp); + +fillbuf: + + /* we are not free, nor do we contain interesting data */ + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (LIST_FIRST(&bp->b_dep) != NULL && + bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + if (bp->b_bufsize) { + allocbuf(bp, 0); + } + bp->b_flags = B_BUSY; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + bp->b_usecount = 5; + /* Here, not kern_physio.c, is where this should be done*/ + LIST_INIT(&bp->b_dep); + + maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; + + /* + * we assume that buffer_map is not at address 0 + */ + addr = 0; + if (maxsize != bp->b_kvasize) { + bfreekva(bp); + +findkvaspace: + /* + * See if we have buffer kva space + */ + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + if (kvafreespace 
> 0) { + int totfree = 0, freed; + do { + freed = 0; + for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) { + if (bp1->b_kvasize != 0) { + totfree += bp1->b_kvasize; + freed = bp1->b_kvasize; + bremfree(bp1); + bfreekva(bp1); + brelse(bp1); + break; + } + } + } while (freed); + /* + * if we found free space, then retry with the same buffer. + */ + if (totfree) + goto findkvaspace; + } + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + } + + /* + * See if we are below are allocated minimum + */ + if (bufspace >= (maxbufspace + nbyteswritten)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + + /* + * create a map entry for the buffer -- in essence + * reserving the kva space. + */ + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + } + bp->b_data = bp->b_kvabase; + + return (bp); +} + +static void +waitfreebuffers(int slpflag, int slptimeo) { + while (numfreebuffers < hifreebuffers) { + flushdirtybuffers(slpflag, slptimeo); + if (numfreebuffers < hifreebuffers) + break; + needsbuffer |= VFS_BIO_NEED_FREE; + if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) + break; + } +} + +static void +flushdirtybuffers(int slpflag, int slptimeo) { + int s; + static pid_t flushing = 0; + + s = splbio(); + + if (flushing) { + if (flushing == curproc->p_pid) { + splx(s); + return; + } + while (flushing) { + if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) { + splx(s); + return; + } + } + } + flushing = curproc->p_pid; + + while (numdirtybuffers > lodirtybuffers) { + struct buf *bp; + needsbuffer |= VFS_BIO_NEED_LOWLIMIT; + bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + if (bp == NULL) + bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + + while (bp && ((bp->b_flags & B_DELWRI) == 0)) { + bp = TAILQ_NEXT(bp, b_freelist); + } + + if (bp) { + vfs_bio_awrite(bp); + continue; + } + break; + } + + flushing = 0; + wakeup(&flushing); + splx(s); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. + */ + +int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) + return 0; + + obj = vp->v_object; + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + return 0; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + return 0; + } + return 1; +} + +/* + * now we set the dirty range for the buffer -- + * for NFS -- if the file is mapped and pages have + * been written to, let it know. 
We want the + * entire range of the buffer to be marked dirty if + * any of the pages have been written to for consistancy + * with the b_validoff, b_validend set in the nfs write + * code, and used by the nfs read code. + */ +static void +vfs_setdirty(struct buf *bp) { + int i; + vm_object_t object; + vm_offset_t boffset; +#if 0 + vm_offset_t offset; +#endif + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. + */ + if ((bp->b_flags & B_VMIO) && + ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) { + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + vm_page_test_dirty(bp->b_pages[i]); + } + + /* + * scan forwards for the first page modified + */ + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) { + break; + } + } + + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + if (boffset < bp->b_dirtyoff) { + bp->b_dirtyoff = max(boffset, 0); + } + + /* + * scan backwards for the last page modified + */ + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i + 1); +#if 0 + offset = boffset + bp->b_pages[0]->pindex; + if (offset >= object->size) + boffset = object->size - bp->b_pages[0]->pindex; +#endif + boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + if (bp->b_dirtyend < boffset) + bp->b_dirtyend = min(boffset, bp->b_bufsize); + } +} + +/* + * Get a block given a specified block and offset into a file/device. + */ +struct buf * +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) +{ + struct buf *bp; + int i, s; + struct bufhashhdr *bh; + +#if !defined(MAX_PERF) + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); +#endif + + s = splbio(); +loop: + if (numfreebuffers < lofreebuffers) { + waitfreebuffers(slpflag, slptimeo); + } + + if ((bp = gbincore(vp, blkno))) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + + if (!tsleep(bp, + (PRIBIO + 4) | slpflag, "getblk", slptimeo)) { + goto loop; + } + + splx(s); + return (struct buf *) NULL; + } + bp->b_flags |= B_BUSY | B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies for non-VMIO case. + */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize) + ) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + } else { + if ((bp->b_flags & B_VMIO) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + } + } + goto loop; + } + } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting cleared. + */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * Check that the constituted buffer really deserves for the + * B_CACHE bit to be set. B_VMIO type buffers might not + * contain fully valid pages. Normal (old-style) buffers + * should be fully valid. This might also lead to B_CACHE + * getting clear. 
+ */ + if ((bp->b_flags & B_VMIO|B_CACHE) == (B_VMIO|B_CACHE)) { + int checksize = bp->b_bufsize; + int poffset = bp->b_offset & PAGE_MASK; + int resid; + for (i = 0; i < bp->b_npages; i++) { + resid = (checksize > (PAGE_SIZE - poffset)) ? + (PAGE_SIZE - poffset) : checksize; + if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) { + bp->b_flags &= ~(B_CACHE | B_DONE); + break; + } + checksize -= resid; + poffset = 0; + } + } + + /* + * If B_DELWRI is set and B_CACHE got cleared ( or was + * already clear ), we have to commit the write and + * retry. The NFS code absolutely depends on this, + * and so might the FFS code. In anycase, it formalizes + * the B_CACHE rules. See sys/buf.h. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + VOP_BWRITE(bp); + goto loop; + } + + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + splx(s); + return (bp); + } else { + int bsize, maxsize, vmio; + off_t offset; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else if (vp->v_mountedhere) + bsize = vp->v_mountedhere->mnt_stat.f_iosize; + else if (vp->v_mount) + bsize = vp->v_mount->mnt_stat.f_iosize; + else + bsize = size; + + offset = (off_t)blkno * bsize; + vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); + maxsize = vmio ? size + (offset & PAGE_MASK) : size; + maxsize = imax(maxsize, bsize); + + if ((bp = getnewbuf(vp, blkno, + slpflag, slptimeo, size, maxsize)) == 0) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * Normally the vnode is locked so this isn't a problem. + * VBLK type I/O requests, however, don't lock the vnode. + */ + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bp->b_offset = offset; + + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = BUFHASH(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + if (vmio) { + bp->b_flags |= (B_VMIO | B_CACHE); +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG && vp->v_type != VBLK) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + + allocbuf(bp, size); + + splx(s); + return (bp); + } +} + +/* + * Get an empty, disassociated buffer of given size. + */ +struct buf * +geteblk(int size) +{ + struct buf *bp; + int s; + + s = splbio(); + while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + return (bp); +} + + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. 
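As a quick illustration of the size rounding allocbuf() performs below: in the non-VMIO path, malloc-backed buffers are rounded to DEV_BSIZE, page-backed buffers to a whole page, and only a fresh buffer no larger than half a page (and only while bufmallocspace is under its limit) is given malloced memory at all. A minimal user-space sketch with illustrative sizes; DEV_BSIZE of 512 and a 4K page are assumptions:

#include <stdio.h>

#define DEV_BSIZE       512             /* assumption */
#define PAGE_SIZE       4096            /* assumption: i386 page size */

int
main(void)
{
        int size = 1000;                /* requested buffer size, illustrative */
        int mbsize, newbsize;

        mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);     /* malloc case */
        newbsize = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);   /* page case   */

        printf("size %d -> mbsize %d, newbsize %d\n", size, mbsize, newbsize);
        printf("eligible for malloc backing: %d\n", mbsize <= PAGE_SIZE / 2);
        return (0);
}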
+ */ + +int +allocbuf(struct buf *bp, int size) +{ + int newbsize, mbsize; + int i; + +#if !defined(MAX_PERF) + if (!(bp->b_flags & B_BUSY)) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); +#endif + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else +#endif + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bp->b_bufsize = 0; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } +#endif + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer grows. + */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufspace += mbsize; + bufmallocspace += mbsize; + return 1; + } +#endif + origbuf = NULL; + origbufsize = 0; +#if !defined(NO_B_MALLOC) + /* + * If the buffer is growing on its other-than-first allocation, + * then we revert to the page-allocation scheme. + */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_bufsize = 0; + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } +#endif + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); +#if !defined(NO_B_MALLOC) + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } +#endif + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (size == 0) ? 
0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); +#endif + + if (newbsize < bp->b_bufsize) { + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ + m = bp->b_pages[i]; + KASSERT(m != bogus_page, + ("allocbuf: bogus page found")); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; + + bp->b_pages[i] = NULL; + vm_page_unwire(m, 0); + } + pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (newbsize > bp->b_bufsize) { + vm_object_t obj; + vm_offset_t tinc, toff; + vm_ooffset_t off; + vm_pindex_t objoff; + int pageindex, curbpnpages; + struct vnode *vp; + int bsize; + int orig_validoff = bp->b_validoff; + int orig_validend = bp->b_validend; + + vp = bp->b_vp; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else + bsize = vp->v_mount->mnt_stat.f_iosize; + + if (bp->b_npages < desiredpages) { + obj = vp->v_object; + tinc = PAGE_SIZE; + + off = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("allocbuf: no buffer offset")); + curbpnpages = bp->b_npages; + doretry: + bp->b_validoff = orig_validoff; + bp->b_validend = orig_validend; + bp->b_flags |= B_CACHE; + for (toff = 0; toff < newbsize; toff += tinc) { + objoff = OFF_TO_IDX(off + toff); + pageindex = objoff - OFF_TO_IDX(off); + tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK); + if (pageindex < curbpnpages) { + + m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG + if (m->pindex != objoff) + panic("allocbuf: page changed offset?!!!?"); +#endif + if (tinc > (newbsize - toff)) + tinc = newbsize - toff; + if (bp->b_flags & B_CACHE) + vfs_buf_set_valid(bp, off, toff, tinc, m); + continue; + } + m = vm_page_lookup(obj, objoff); + if (!m) { + m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); + if (!m) { + VM_WAIT; + vm_pageout_deficit += (desiredpages - curbpnpages); + goto doretry; + } + + vm_page_wire(m); + vm_page_wakeup(m); + bp->b_flags &= ~B_CACHE; + + } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) { + /* + * If we had to sleep, retry. + * + * Also note that we only test + * PG_BUSY here, not m->busy. + * + * We cannot sleep on m->busy + * here because a vm_fault -> + * getpages -> cluster-read -> + * ...-> allocbuf sequence + * will convert PG_BUSY to + * m->busy so we have to let + * m->busy through if we do + * not want to deadlock. 
+ */ + goto doretry; + } else { + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + if (tinc > (newbsize - toff)) + tinc = newbsize - toff; + if (bp->b_flags & B_CACHE) + vfs_buf_set_valid(bp, off, toff, tinc, m); + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); + } + bp->b_pages[pageindex] = m; + curbpnpages = pageindex + 1; + } + if (vp->v_tag == VT_NFS && + vp->v_type != VBLK) { + if (bp->b_dirtyend > 0) { + bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); + bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + } + if (bp->b_validend == 0) + bp->b_flags &= ~B_CACHE; + } + bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; + } + } + } + if (bp->b_flags & B_VMIO) + vmiospace += (newbsize - bp->b_bufsize); + bufspace += (newbsize - bp->b_bufsize); + bp->b_bufsize = newbsize; + bp->b_bcount = size; + return 1; +} + +/* + * Wait for buffer I/O completion, returning error status. + */ +int +biowait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) +#if defined(NO_SCHEDULE_MODS) + tsleep(bp, PRIBIO, "biowait", 0); +#else + if (bp->b_flags & B_READ) + tsleep(bp, PRIBIO, "biord", 0); + else + tsleep(bp, PRIBIO, "biowr", 0); +#endif + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_flags & B_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * Finish I/O on a buffer, calling an optional function. + * This is usually called from interrupt level, so process blocking + * is not *a good idea*. 
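The completion path below also supports a callback style: when B_CALL is set, biodone() invokes b_iodone instead of waking a sleeper, so the submitter never blocks. A hedged kernel-context sketch of that pattern; the function names are hypothetical, the buffer is assumed to be already set up for the transfer, and the usual headers are assumed.

static void
example_iodone(struct buf *bp)
{
        if (bp->b_flags & B_ERROR)
                printf("async write failed: %d\n", bp->b_error);
        brelse(bp);                     /* the callback owns the buffer now */
}

static void
example_submit_async(struct vnode *vp, struct buf *bp)
{
        bp->b_flags |= B_ASYNC | B_CALL;
        bp->b_iodone = example_iodone;
        VOP_STRATEGY(vp, bp);           /* biodone() will run example_iodone */
}

Synchronous callers instead leave B_CALL clear and sleep in biowait(), as bread() and bwrite() above do.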
+ */ +void +biodone(register struct buf * bp) +{ + int s; + + s = splbio(); + +#if !defined(MAX_PERF) + if (!(bp->b_flags & B_BUSY)) + panic("biodone: buffer not busy"); +#endif + + if (bp->b_flags & B_DONE) { + splx(s); +#if !defined(MAX_PERF) + printf("biodone: buffer already done\n"); +#endif + return; + } + bp->b_flags |= B_DONE; + + if (bp->b_flags & B_FREEBUF) { + brelse(bp); + splx(s); + return; + } + + if ((bp->b_flags & B_READ) == 0) { + vwakeup(bp); + } + + /* call optional completion function if requested */ + if (bp->b_flags & B_CALL) { + bp->b_flags &= ~B_CALL; + (*bp->b_iodone) (bp); + splx(s); + return; + } + if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) + (*bioops.io_complete)(bp); + + if (bp->b_flags & B_VMIO) { + int i, resid; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + obj = vp->v_object; + +#if defined(VFS_BIO_DEBUG) + if (vp->v_usecount == 0) { + panic("biodone: zero vnode ref count"); + } + + if (vp->v_object == NULL) { + panic("biodone: missing VM object"); + } + + if ((vp->v_flag & VOBJBUF) == 0) { + panic("biodone: vnode is not setup for merged cache"); + } +#endif + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("biodone: no buffer offset")); + +#if !defined(MAX_PERF) + if (!obj) { + panic("biodone: no object"); + } +#endif +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + iosize = bp->b_bufsize; + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (!m) { +#if defined(VFS_BIO_DEBUG) + printf("biodone: page disappeared\n"); +#endif + vm_object_pip_subtract(obj, 1); + continue; + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf( +"biodone: foff(%lu)/m->pindex(%d) mismatch\n", + (unsigned long)foff, m->pindex); + } +#endif + resid = IDX_TO_OFF(m->pindex + 1) - foff; + if (resid > iosize) + resid = iosize; + + /* + * In the write case, the valid and clean bits are + * already changed correctly, so we only need to do this + * here in the read case. + */ + if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { + vfs_page_set_valid(bp, foff, i, m); + } + vm_page_flag_clear(m, PG_ZERO); + + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! 
+ */ + if (m->busy == 0) { +#if !defined(MAX_PERF) + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); +#endif + if (vp->v_type != VBLK) +#if !defined(MAX_PERF) + printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); +#endif + panic("biodone: page busy < 0\n"); + } + vm_page_io_finish(m); + vm_object_pip_subtract(obj, 1); + foff += resid; + iosize -= resid; + } + if (obj) + vm_object_pip_wakeupn(obj, 0); + } + /* + * For asynchronous completions, release the buffer now. The brelse + * checks for B_WANTED and will do the wakeup there if necessary - so + * no need to do a wakeup here in the async case. + */ + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } else { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + splx(s); +} + +#if 0 /* not with kirks code */ +static int vfs_update_interval = 30; + +static void +vfs_update() +{ + while (1) { + tsleep(&vfs_update_wakeup, PUSER, "update", + hz * vfs_update_interval); + vfs_update_wakeup = 0; + sync(curproc, NULL); + } +} + +static int +sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) + wakeup(&vfs_update_wakeup); + return error; +} + +SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, + &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); + +#endif + + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. + */ +void +vfs_unbusy_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); +#if !defined(MAX_PERF) + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } +#endif + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + vm_object_pip_subtract(obj, 1); + vm_page_flag_clear(m, PG_ZERO); + vm_page_io_finish(m); + } + vm_object_pip_wakeupn(obj, 0); + } +} + +/* + * Set NFS' b_validoff and b_validend fields from the valid bits + * of a page. If the consumer is not NFS, and the page is not + * valid for the entire range, clear the B_CACHE flag to force + * the consumer to re-read the page. + * + * B_CACHE interaction is especially tricky. + */ +static void +vfs_buf_set_valid(struct buf *bp, + vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, + vm_page_t m) +{ + if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) { + vm_offset_t svalid, evalid; + int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE); + + /* + * This only bothers with the first valid range in the + * page. 
+ */ + svalid = off; + while (validbits && !(validbits & 1)) { + svalid += DEV_BSIZE; + validbits >>= 1; + } + evalid = svalid; + while (validbits & 1) { + evalid += DEV_BSIZE; + validbits >>= 1; + } + evalid = min(evalid, off + size); + /* + * We can only set b_validoff/end if this range is contiguous + * with the range built up already. If we cannot set + * b_validoff/end, we must clear B_CACHE to force an update + * to clean the bp up. + */ + if (svalid == bp->b_validend) { + bp->b_validoff = min(bp->b_validoff, svalid); + bp->b_validend = max(bp->b_validend, evalid); + } else { + bp->b_flags &= ~B_CACHE; + } + } else if (!vm_page_is_valid(m, + (vm_offset_t) ((foff + off) & PAGE_MASK), + size)) { + bp->b_flags &= ~B_CACHE; + } +} + +/* + * Set the valid bits in a page, taking care of the b_validoff, + * b_validend fields which NFS uses to optimise small reads. Off is + * the offset within the file and pageno is the page index within the buf. + * + * XXX we have to set the valid & clean bits for all page fragments + * touched by b_validoff/validend, even if the page fragment goes somewhat + * beyond b_validoff/validend due to alignment. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) +{ + struct vnode *vp = bp->b_vp; + vm_ooffset_t soff, eoff; + + soff = off; + eoff = (off + PAGE_SIZE) & ~PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + if (vp->v_tag == VT_NFS && vp->v_type != VBLK) { + vm_ooffset_t sv, ev; + vm_page_set_invalid(m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff)); + sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) & + ~(DEV_BSIZE - 1); + soff = qmax(sv, soff); + eoff = qmin(ev, eoff); + } + if (eoff > soff) + vm_page_set_validclean(m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff)); +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. + */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i, bogus; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + vm_ooffset_t foff; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + vfs_setdirty(bp); + +retry: + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + if (vm_page_sleep_busy(m, FALSE, "vbpage")) + goto retry; + } + + bogus = 0; + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_ZERO); + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + } + + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) + vfs_page_set_valid(bp, foff, i, m); + else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus++; + } + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + if (bogus) + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. 
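The sv/ev computation in vfs_page_set_valid() above simply rounds both byte offsets up to the next DEV_BSIZE boundary before clipping the range handed to the VM page. A minimal user-space sketch of that rounding with illustrative offsets:

#include <stdio.h>

#define DEV_BSIZE       512

int
main(void)
{
        long b_offset = 8192, b_validoff = 700, b_validend = 3000;      /* illustrative */
        long sv, ev;

        sv = (b_offset + b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        ev = (b_offset + b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

        printf("valid bytes [%ld, %ld) round to [%ld, %ld)\n",
            b_offset + b_validoff, b_offset + b_validend, sv, ev);
        return (0);
}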
+ */ +void +vfs_clean_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages: no buffer offset")); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + vfs_page_set_valid(bp, foff, i, m); + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + } +} + +void +vfs_bio_clrbuf(struct buf *bp) { + int i, mask = 0; + caddr_t sa, ea; + if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && + ((bp->b_pages[0]->valid & mask) != mask)) { + bzero(bp->b_data, bp->b_bufsize); + } + bp->b_pages[0]->valid |= mask; + bp->b_resid = 0; + return; + } + ea = sa = bp->b_data; + for(i=0;i<bp->b_npages;i++,sa=ea) { + int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE; + ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); + ea = (caddr_t)ulmin((u_long)ea, + (u_long)bp->b_data + bp->b_bufsize); + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(sa, ea - sa); + } + } else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && + (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(sa, DEV_BSIZE); + } + } + bp->b_pages[i]->valid |= mask; + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_unload pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. 
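Before the anonymous-page helpers below, a worked example of the per-DEV_BSIZE valid mask that vfs_bio_clrbuf() above builds for a small single-page buffer: one bit per 512-byte chunk of the buffer. A minimal user-space sketch with an illustrative 2K buffer:

#include <stdio.h>

#define DEV_BSIZE       512

int
main(void)
{
        int b_bufsize = 2048;           /* illustrative sub-page buffer */
        int mask;

        mask = (1 << (b_bufsize / DEV_BSIZE)) - 1;      /* 4 chunks -> 0xf */
        printf("bufsize %d -> valid mask 0x%x\n", b_bufsize, mask);
        return (0);
}

If the page's valid bits already cover that mask nothing needs zeroing; otherwise vfs_bio_clrbuf() clears the uncovered bytes and then ORs the mask into m->valid.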
+ */ +void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + +tryagain: + + p = vm_page_alloc(kernel_object, + ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + if (!p) { + vm_pageout_deficit += (to - from) >> PAGE_SHIFT; + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + p->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(p, PG_ZERO); + pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); + bp->b_pages[index] = p; + vm_page_wakeup(p); + } + bp->b_npages = index; +} + +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index, newnpages; + + from = round_page(from); + to = round_page(to); + newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { +#if !defined(MAX_PERF) + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } +#endif + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_busy(p); + vm_page_unwire(p, 0); + vm_page_free(p); + } + } + bp->b_npages = newnpages; +} + + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; + + if (!have_addr) { + db_printf("usage: show buffer <addr>\n"); + return; + } + + db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc, + (u_int)bp->b_flags, PRINT_BUF_FLAGS); + db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " + "b_resid = %ld\nb_dev = 0x%x, b_data = %p, " + "b_blkno = %d, b_pblkno = %d\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, + (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } +} +#endif /* DDB */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 0000000..a8ac5e7 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $Id: vfs_cache.c,v 1.37 1997/12/19 23:18:37 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/malloc.h> + + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name cacheing. 
+ */ +#define NCHHASH(dvp, cnp) \ + (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ +static u_long nchash; /* size of hash table */ +SYSCTL_INT(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); +static u_long ncnegfactor = 16; /* ratio of negative entries */ +SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); +static u_long numneg; /* number of cache entries allocated */ +SYSCTL_INT(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); +static u_long numcache; /* number of cache entries allocated */ +SYSCTL_INT(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); +struct nchstats nchstats; /* cache effectiveness statistics */ + +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); +SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); + +/* + * The new name cache statistics + */ +SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); +STATNODE(CTLFLAG_RD, numneg, &numneg); +STATNODE(CTLFLAG_RD, numcache, &numcache); +static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); +static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); +static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); +static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); +static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); +static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); +static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); +static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); +static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); +static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); + + +static void cache_zap __P((struct namecache *ncp)); + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 1 +/* + * Delete an entry from its hash list and move it to the front + * of the LRU list for immediate reuse. + */ +static void +cache_zap(ncp) + struct namecache *ncp; +{ + LIST_REMOVE(ncp, nc_hash); + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) + vdrop(ncp->nc_dvp); + if (ncp->nc_vp) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + numneg--; + } + numcache--; + free(ncp, M_CACHE); +} + +/* + * Lookup an entry in the cache + * + * We don't do this if the segment name is long, simply so the cache + * can avoid holding long names (which would either waste space, or + * add greatly to the complexity). + * + * Lookup is called with dvp pointing to the directory to search, + * cnp pointing to the name of the entry being sought. If the lookup + * succeeds, the vnode is returned in *vpp, and a status of -1 is + * returned. If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. If the lookup + * fails, a status of zero is returned. 
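The contract just described is worth seeing from the caller's side: -1 means a positive hit with *vpp filled in, ENOENT means a cached negative entry, and 0 means the cache has no answer. A hedged kernel-context sketch of a consumer (a simplified variant of vfs_cache_lookup() further down; locking, vget() of the result, and the v_id capability check are deliberately omitted, and the helper name is hypothetical):

static int
example_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
        int error = cache_lookup(dvp, vpp, cnp);

        if (error == -1)
                return (0);             /* hit: *vpp is the answer */
        if (error == ENOENT)
                return (ENOENT);        /* cached "name does not exist" */
        return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));       /* miss: scan the directory */
}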
+ */ + +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + register struct namecache *ncp; + + if (!doingcache) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + numcalls++; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + *vpp = dvp; + dothits++; + return (-1); + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + dotdothits++; + if (dvp->v_dd->v_id != dvp->v_ddid || + (cnp->cn_flags & MAKEENTRY) == 0) { + dvp->v_ddid = 0; + return (0); + } + *vpp = dvp->v_dd; + return (-1); + } + } + + LIST_FOREACH(ncp, (NCHHASH(dvp, cnp)), nc_hash) { + numchecks++; + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == 0) { + if ((cnp->cn_flags & MAKEENTRY) == 0) { + nummisszap++; + } else { + nummiss++; + } + nchstats.ncs_miss++; + return (0); + } + + /* We don't want to have an entry, so dump it */ + if ((cnp->cn_flags & MAKEENTRY) == 0) { + numposzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (ncp->nc_vp) { + numposhits++; + nchstats.ncs_goodhits++; + *vpp = ncp->nc_vp; + return (-1); + } + + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + numnegzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + numneghits++; + /* + * We found a "negative" match, ENOENT notifies client of this match. + * The nc_vpid field records whether this is a whiteout. + */ + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + nchstats.ncs_neghits++; + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + return (ENOENT); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + register struct namecache *ncp; + register struct nchashhead *ncpp; + + if (!doingcache) + return; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + return; + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + if (vp) { + dvp->v_dd = vp; + dvp->v_ddid = vp->v_id; + } else { + dvp->v_dd = dvp; + dvp->v_ddid = 0; + } + return; + } + } + + ncp = (struct namecache *) + malloc(sizeof *ncp + cnp->cn_namelen, M_CACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + if (!vp) { + numneg++; + ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; + } else if (vp->v_type == VDIR) { + vp->v_dd = dvp; + vp->v_ddid = dvp->v_id; + } + + /* + * Fill in cache info, if vp is NULL this is a "negative" cache entry. + * For negative entries, we have to record whether it is a whiteout. + * the whiteout flag is stored in the nc_vpid field which is + * otherwise unused. 
+ */ + ncp->nc_vp = vp; + ncp->nc_dvp = dvp; + ncp->nc_nlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, ncp->nc_name, ncp->nc_nlen); + ncpp = NCHHASH(dvp, cnp); + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + if (LIST_EMPTY(&dvp->v_cache_src)) + vhold(dvp); + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + if (vp) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + } + if (numneg*ncnegfactor > numcache) { + ncp = TAILQ_FIRST(&ncneg); + cache_zap(ncp); + } +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +void +nchinit() +{ + + TAILQ_INIT(&ncneg); + nchashtbl = hashinit(desiredvnodes*2, M_CACHE, &nchash); +} + +/* + * Invalidate all entries to particular vnode. + * + * We actually just increment the v_id, that will do it. The stale entries + * will be purged by lookup as they get found. If the v_id wraps around, we + * need to ditch the entire cache, to avoid confusion. No valid vnode will + * ever have (v_id == 0). + */ +void +cache_purge(vp) + struct vnode *vp; +{ + static u_long nextid; + + while (!LIST_EMPTY(&vp->v_cache_src)) + cache_zap(LIST_FIRST(&vp->v_cache_src)); + while (!TAILQ_EMPTY(&vp->v_cache_dst)) + cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); + + nextid++; + while (nextid == vp->v_id || !nextid) + continue; + vp->v_id = nextid; + vp->v_dd = vp; + vp->v_ddid = 0; +} + +/* + * Flush all entries referencing a particular filesystem. + * + * Since we need to check it anyway, we will flush all the invalid + * entries at the same time. + */ +void +cache_purgevfs(mp) + struct mount *mp; +{ + struct nchashhead *ncpp; + struct namecache *ncp, *nnp; + + /* Scan hash tables for applicable entries */ + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) { + nnp = LIST_NEXT(ncp, nc_hash); + if (ncp->nc_dvp->v_mount == mp) { + cache_zap(ncp); + } + } + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. + */ + +int +vfs_cache_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vdp; + struct vnode *pdp; + int lockparent; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + struct proc *p = cnp->cn_proc; + u_long vpid; /* capability number of vnode */ + + *vpp = NULL; + vdp = ap->a_dvp; + lockparent = flags & LOCKPARENT; + + if (vdp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc); + + if (error) + return (error); + + error = cache_lookup(vdp, vpp, cnp); + + if (!error) + return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp)); + + if (error == ENOENT) + return (error); + + pdp = vdp; + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == vdp) { /* lookup on "." 
*/ + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp, 0, p); + error = vget(vdp, LK_EXCLUSIVE, p); + if (!error && lockparent && (flags & ISLASTCN)) + error = vn_lock(pdp, LK_EXCLUSIVE, p); + } else { + error = vget(vdp, LK_EXCLUSIVE, p); + if (!lockparent || error || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp, 0, p); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. + */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + vput(vdp); + if (lockparent && pdp != vdp && (flags & ISLASTCN)) + VOP_UNLOCK(pdp, 0, p); + } + error = vn_lock(pdp, LK_EXCLUSIVE, p); + if (error) + return (error); + return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp)); +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 0000000..781508e --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,840 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $ + */ + +#include "opt_debug_cluster.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); + +static struct cluster_save * + cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); +static struct buf * + cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp)); + +extern vm_page_t bogus_page; + +extern int cluster_pbuf_freecnt; + +/* + * Maximum number of blocks for read-ahead. + */ +#define MAXRA 32 + +/* + * This replaces bread. + */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + long totread; + int seqcount; + struct buf **bpp; +{ + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; + + error = 0; + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_maxio/size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; + + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; + /* + * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. + */ + s = splbio(); + for(i=1;i<maxra;i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know to check + * again. 
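 *
 * Worked example (editorial addition): with racluster = 4 and
 * maxra = 10, the test below marks the buffers at i = 3, 7 and 9
 * with B_RAM, i.e. the last block of each prospective cluster and
 * the final read-ahead block, so that a later hit on one of them
 * re-arms this read-ahead check.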
+ */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + + if ((tbp->b_usecount < 1) && + ((tbp->b_flags & B_BUSY) == 0) && + (tbp->b_qindex == QUEUE_LRU)) { + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); + } + } + splx(s); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + } else { + off_t firstread = bp->b_offset; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += (bp->b_bufsize / size); + } else { +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_READ | B_RAM; + lblkno += 1; + } + } + + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. + */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_READ | B_ASYNC | B_RAM; + rbp->b_blkno = blkno; + } + } + } + + /* + * handle the synchronous read + */ + if (bp) { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%ld,%ld,%d) ", + (long)bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(bp, 0); + error = VOP_STRATEGY(vp, bp); + curproc->p_stats->p_ru.ru_inblock++; + } + + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+(%ld,%ld,%ld,%d) ", + (long)rbp->b_lblkno, rbp->b_bcount, + (long)(rbp->b_lblkno - origblkno), + seqcount); + else + printf("A(%ld,%ld,%ld,%d) ", + (long)rbp->b_lblkno, rbp->b_bcount, + (long)(rbp->b_lblkno - origblkno), + seqcount); + } +#endif + + if ((rbp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(rbp, 0); + (void) VOP_STRATEGY(vp, rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (biowait(reqbp)); + else + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. 
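 *
 * Worked example (editorial addition) of the EOF clamp at the top of
 * this routine: with size = 8192, filesize = 100000 and lbn = 10, a
 * requested run of 8 is trimmed until size * (lbn + run) <= filesize,
 * leaving run = 2, so only blocks 10 and 11, which lie entirely
 * within the file, are built into the cluster.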
+ */ +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lbn; + daddr_t blkno; + long size; + int run; + struct buf *fbp; +{ + struct buf *bp, *tbp; + daddr_t bn; + int i, inc, j; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != filesize %ld\n", + size, vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_flags |= B_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_READ | B_RAM; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == 0) + return tbp; + + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_maxio) + break; + + if (tbp = incore(vp, lbn + i)) { + if (tbp->b_flags & B_BUSY) + break; + + for (j = 0; j < tbp->b_npages; j++) + if (tbp->b_pages[j]->valid) + break; + + if (j != tbp->b_npages) + break; + + if (tbp->b_bcount != size) + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0); + + if ((tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + for (j = 0;j < tbp->b_npages; j++) + if (tbp->b_pages[j]->valid) + break; + + if (j != tbp->b_npages) { + bqrelse(tbp); + break; + } + + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + tbp->b_flags |= B_READ | B_ASYNC; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } + } + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + bp->b_bcount += tbp->b_bcount; + bp->b_bufsize += tbp->b_bufsize; + } + + for(j=0;j<bp->b_npages;j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; + } + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. 
+ */ +void +cluster_callback(bp) + struct buf *bp; +{ + struct buf *nbp, *tbp; + int error = 0; + + /* + * Must propogate errors to all the components. + */ + if (bp->b_flags & B_ERROR) + error = bp->b_error; + + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_flags |= B_ERROR; + tbp->b_error = error; + } else + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + biodone(tbp); + } + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(bp, filesize) + struct buf *bp; + u_quad_t filesize; +{ + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + vp = bp->b_vp; + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + if (vp->v_type == VREG) { + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_maxio / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. + * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. 
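 *
 * Worked example (editorial addition) for the test that guards this
 * branch: with an 8192-byte block size, btodb(lblocksize) is 16, so
 * after a write of logical block 41 that was mapped to disk block
 * 5000 (v_lastw = 41, v_lasta = 5000), only a write of lbn 42 mapped
 * to disk block 5016 keeps extending the pending cluster (assuming
 * v_clen is non-zero); anything else lands in this branch to push or
 * restart the cluster.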
+ */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out. + */ + bdwrite(bp); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(vp, size, start_lbn, len) + struct vnode *vp; + long size; + daddr_t start_lbn; + int len; +{ + struct buf *bp, *tbp; + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + while (len > 0) { + s = splbio(); + if (((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { + ++start_lbn; + --len; + splx(s); + continue; + } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + + /* + * Extra memory in the buffer, punt on this buffer. XXX we could + * handle this in most cases, but we would have to push the extra + * memory down to after our max possible cluster size and then + * potentially pull it back up if the cluster was terminated + * prematurely--too much hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) { + bp->b_wcred = tbp->b_wcred; + crhold(bp->b_wcred); + } + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | + (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + s = splbio(); + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } + + /* + * If it IS in core, but has different + * characteristics, don't cluster with it. 
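 *
 * Concrete illustration (editorial addition): for a cluster built
 * from B_VMIO buffers without B_NEEDCOMMIT, a candidate whose flags
 * include B_DELWRI, B_CLUSTEROK and B_VMIO (and none of B_INVAL,
 * B_BUSY or B_NEEDCOMMIT) passes the test below; a buffer that is
 * merely delayed-write, or that carries B_NEEDCOMMIT when the
 * cluster does not, terminates the scan instead.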
+ */ + if ((tbp->b_flags & + (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | + B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { + splx(s); + break; + } + + if (tbp->b_wcred != bp->b_wcred) { + splx(s); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_maxio / PAGE_SIZE))) { + splx(s); + break; + } + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. + */ + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + } /* end of code for non-first buffers only */ + /* check for latent dependencies to be handled */ + if ((LIST_FIRST(&tbp->b_dep)) != NULL && + bioops.io_start) + (*bioops.io_start)(tbp); + /* + * If the IO is via the VM then we do some + * special VM hackery. (yuck) + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + if (i != 0) { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (m->flags & PG_BUSY) + goto finishcluster; + } + } + + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + s = splbio(); + --numdirtybuffers; + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + tbp->b_flags |= B_ASYNC; + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(vp, last_bp) + struct vnode *vp; + struct buf *last_bp; +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, len; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c new file mode 100644 index 0000000..a7a830f --- /dev/null +++ b/sys/kern/vfs_conf.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include "opt_bootp.h" + +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/kernel.h> +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct"); + +/* + * These define the root filesystem, device, and root filesystem type. + */ +dev_t rootdevs[] = { NODEV, NODEV }; +char *rootdevnames[2]; +struct vnode *rootvnode; +char *mountrootfsname; +#ifdef BOOTP +extern void bootpc_init __P((void)); +#endif + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. 
+ */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * NONE + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +static void +vfs_mountrootfs(void *unused) +{ + struct mount *mp; + int i, err; + struct proc *p = curproc; /* XXX */ + dev_t orootdev; + +#ifdef BOOTP + bootpc_init(); +#endif + /* + * New root mount structure + */ + if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) { + printf("error %d: ", err); + panic("cannot mount root\n"); + return ; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = ENXIO; + orootdev = rootdev; + if (rootdevs[0] == NODEV) + rootdevs[0] = rootdev; + for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) { + if (rootdevs[i] == NODEV) + break; + rootdev = rootdevs[i]; + if (rootdev != orootdev) { + printf("changing root device to %s\n", rootdevnames[i]); + orootdev = rootdev; + } + strncpy(mp->mnt_stat.f_mntfromname, + rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1); + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err != ENXIO) + break; + } + if (err) { + /* + * XXX should ask the user for the name in some cases. + * Why do we call vfs_unbusy() here and not after ENXIO + * is returned above? + */ + vfs_unbusy(mp, p); + /* + * free mount struct before failing + * (hardly worthwhile with the PANIC eh?) + */ + free( mp, M_MOUNT); + printf("error %d: ", err); + panic("cannot mount root (2)\n"); + return; + } + + simple_lock(&mountlist_slock); + + /* + * Add fs to list of mounted file systems + */ + CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); + + simple_unlock(&mountlist_slock); + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + return; +} + +SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL) + diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c new file mode 100644 index 0000000..b73b126 --- /dev/null +++ b/sys/kern/vfs_default.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/poll.h> + +static int vop_nostrategy __P((struct vop_strategy_args *)); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + */ + +vop_t **default_vnodeop_p; +static struct vnodeopv_entry_desc default_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_abortop_desc, (vop_t *) vop_null }, + { &vop_advlock_desc, (vop_t *) vop_einval }, + { &vop_bwrite_desc, (vop_t *) vop_stdbwrite }, + { &vop_close_desc, (vop_t *) vop_null }, + { &vop_fsync_desc, (vop_t *) vop_null }, + { &vop_ioctl_desc, (vop_t *) vop_enotty }, + { &vop_islocked_desc, (vop_t *) vop_noislocked }, + { &vop_lease_desc, (vop_t *) vop_null }, + { &vop_lock_desc, (vop_t *) vop_nolock }, + { &vop_mmap_desc, (vop_t *) vop_einval }, + { &vop_open_desc, (vop_t *) vop_null }, + { &vop_pathconf_desc, (vop_t *) vop_einval }, + { &vop_poll_desc, (vop_t *) vop_nopoll }, + { &vop_readlink_desc, (vop_t *) vop_einval }, + { &vop_reallocblks_desc, (vop_t *) vop_eopnotsupp }, + { &vop_revoke_desc, (vop_t *) vop_revoke }, + { &vop_strategy_desc, (vop_t *) vop_nostrategy }, + { &vop_unlock_desc, (vop_t *) vop_nounlock }, + { NULL, NULL } +}; + +static struct vnodeopv_desc default_vnodeop_opv_desc = + { &default_vnodeop_p, default_vnodeop_entries }; + +VNODEOP_SET(default_vnodeop_opv_desc); + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +int +vop_defaultop(struct vop_generic_args *ap) +{ + + return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap)); +} + +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("illegal vnode op called"); +} + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vprint("", ap->a_vp); + vprint("", ap->a_bp->b_vp); + ap->a_bp->b_flags |= B_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + biodone(ap->a_bp); + return (EOPNOTSUPP); +} + +int 
+vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + * + * These depend on the lock structure being the first element in the + * inode, ie: vp->v_data points to the the lock! + */ +int +vop_stdlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return 0; + } + +#ifndef DEBUG_LOCKS + return (lockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p)); +#else + return (debuglockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p, + "vop_stdlock", ap->a_vp->filename, ap->a_vp->line)); +#endif +} + +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return 0; + } + + return (lockmgr(l, ap->a_flags | LK_RELEASE, &ap->a_vp->v_interlock, + ap->a_p)); +} + +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) + return 0; + + return (lockstatus(l)); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + /* + * Return true for read/write. If the user asked for something + * special, return POLLNVAL, so that clients have a way of + * determining reliably whether or not the extended + * functionality is present without hard-coding knowledge + * of specific filesystem implementations. + */ + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + if ((ap->a_events & ~POLLSTANDARD) == 0) + return (ap->a_events & (POLLRDNORM|POLLWRNORM)); + return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events)); +} + +int +vop_stdbwrite(ap) + struct vop_bwrite_args *ap; +{ + return (bwrite(ap->a_bp)); +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_sharedlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. 
+ * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + if (vp->v_vnlock == NULL) { + if ((flags & LK_TYPE_MASK) == LK_DRAIN) + return (0); + MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock), + M_VNODE, M_WAITOK); + lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE); + } + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: +#ifdef DEBUG_VFS_LOCKS + /* + * Normally, we use shared locks here, but that confuses + * the locking assertions. + */ + vnflags = LK_EXCLUSIVE; + break; +#endif + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; +#ifndef DEBUG_LOCKS + return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); +#else + return (debuglockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p, + "vop_sharedlock", vp->filename, vp->line)); +#endif +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_nolock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ +#ifdef notyet + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. 
+ */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + if (vp->v_vnlock == NULL) { + if ((flags & LK_TYPE_MASK) == LK_DRAIN) + return (0); + MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock), + M_VNODE, M_WAITOK); + lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE); + } + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; + return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); +#else /* for now */ + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return (0); +#endif +} + +/* + * Do the inverse of vop_nolock, handling the interlock in a compatible way. + */ +int +vop_nounlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return (0); + } + return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags, + &ap->a_vp->v_interlock, ap->a_p)); +} + +/* + * Return whether or not the node is in use. + */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockstatus(vp->v_vnlock)); +} + diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new file mode 100644 index 0000000..44b1698 --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,2872 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vmmeter.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void insmntque __P((struct vnode *vp, struct mount *mp)); +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +static void vfree __P((struct vnode *)); +static void vgonel __P((struct vnode *vp, struct proc *p)); +static unsigned long numvnodes; +SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct tobefreelist vnode_tobefree_list; /* vnode free list */ + +static u_long wantfreevnodes = 25; +SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +static u_long freevnodes = 0; +SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +int vfs_ioopt = 0; +#ifdef ENABLE_VFS_IOOPT +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +struct simplelock mntvnode_slock; +int nfs_mount_type = -1; +#ifndef NULL_SIMPLELOCKS +static struct simplelock mntid_slock; +static struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; +#endif +struct nfs_public nfs_pub; /* publicly exported FS */ +static vm_zone_t vnode_zone; + +/* + * The workitem queue. 
+ */ +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +time_t syncdelay = 30; +int rushjob; /* number of slots to run ASAP */ + +static int syncer_delayno = 0; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. + */ +void +vntblinit() +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + TAILQ_INIT(&vnode_tobefree_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); + vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. 
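 *
 * For example (editorial note), the root mount path in vfs_conf.c
 * obtains its mount structure this way before attempting VFS_MOUNT():
 *
 *	if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp)))
 *		panic("cannot mount root");
 *	mp->mnt_flag |= MNT_ROOTFS;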
+ */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. + */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot) __P((void)); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. 
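 *
 * Hypothetical caller sketch (editorial addition; fhp and its
 * fh_fsid field are assumed here, as in the NFS file-handle code):
 * map a file handle back to its mount point, failing with ESTALE
 * when the filesystem is no longer mounted:
 *
 *	struct mount *mp = vfs_getvfs(&fhp->fh_fsid);
 *	if (mp == NULL)
 *		return (ESTALE);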
+ */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = mp->mnt_list.cqe_next) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + simple_unlock(&mountlist_slock); + return (mp); + } + } + simple_unlock(&mountlist_slock); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_short xxxfs_mntid; + + fsid_t tfsid; + int mtype; + + simple_lock(&mntid_slock); + mtype = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); + mp->mnt_stat.f_fsid.val[1] = mtype; + if (xxxfs_mntid == 0) + ++xxxfs_mntid; + tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); + tfsid.val[1] = mtype; + if (mountlist.cqh_first != (void *)&mountlist) { + while (vfs_getvfs(&tfsid)) { + tfsid.val[0]++; + xxxfs_mntid++; + } + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + simple_unlock(&mntid_slock); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern vop_t **dead_vnodeop_p; + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *tvp, *nvp; + vm_object_t object; + TAILQ_HEAD(freelst, vnode) vnode_tmp_list; + + /* + * We take the least recently used vnode from the freelist + * if we can get it and it has no cached pages, and no + * namecache entries are relative to it. 
+ * Otherwise we allocate a new vnode + */ + + s = splbio(); + simple_lock(&vnode_free_list_slock); + TAILQ_INIT(&vnode_tmp_list); + + for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + vp->v_flag &= ~(VTBFREE|VAGE); + vp->v_flag |= VFREE; + if (vp->v_usecount) + panic("tobe free vnode isn't"); + freevnodes++; + } + + if (wantfreevnodes && freevnodes < wantfreevnodes) { + vp = NULL; + } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { + /* + * XXX: this is only here to be backwards compatible + */ + vp = NULL; + } else { + for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + if (!simple_lock_try(&vp->v_interlock)) + continue; + if (vp->v_usecount) + panic("free vnode isn't"); + + object = vp->v_object; + if (object && (object->resident_page_count || object->ref_count)) { + printf("object inconsistant state: RPC: %d, RC: %d\n", + object->resident_page_count, object->ref_count); + /* Don't recycle if it's caching some pages */ + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); + continue; + } else if (LIST_FIRST(&vp->v_cache_src)) { + /* Don't recycle if active in the namecache */ + simple_unlock(&vp->v_interlock); + continue; + } else { + break; + } + } + } + + for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { + nvp = TAILQ_NEXT(tvp, v_freelist); + TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); + simple_unlock(&tvp->v_interlock); + } + + if (vp) { + vp->v_flag |= VDOOMED; + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + simple_unlock(&vnode_free_list_slock); + cache_purge(vp); + vp->v_lease = NULL; + if (vp->v_type != VBAD) { + vgonel(vp, p); + } else { + simple_unlock(&vp->v_interlock); + } + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } +#endif + vp->v_flag = 0; + vp->v_lastr = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_maxio = 0; + } else { + simple_unlock(&vnode_free_list_slock); + vp = (struct vnode *) zalloc(vnode_zone); + bzero((char *) vp, sizeof *vp); + simple_lock_init(&vp->v_interlock); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + splx(s); + + vfs_object_create(vp, p, p->p_ucred); + return (0); +} + +/* + * Move a vnode from one mount queue to another. + */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + simple_lock(&mntvnode_slock); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + LIST_REMOVE(vp, v_mntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. 
+ */ + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + simple_unlock(&mntvnode_slock); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t) &vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 4), "vinvalbuf", + slptimeo); + if (error) { + splx(s); + return (error); + } + break; + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while VOP_BWRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_ASYNC); + VOP_BWRITE(bp); + } + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + (void) VOP_BWRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + simple_lock(&vp->v_interlock); + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + simple_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. 
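+ *
+ * For example, a filesystem's truncate routine might toss every buffer
+ * past the new end of file with something like (names illustrative):
+ *
+ *	error = vtruncbuf(vp, cred, p, new_length, fs_bsize);
+ *
+ * and only then release the corresponding on-disk blocks.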
+ */ +int +vtruncbuf(vp, cred, p, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct proc *p; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb1", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb2", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO, "vtrb3", 0); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + VOP_BWRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= B_VNCLEAN; + bp->b_xflags &= ~B_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + if (bp->b_xflags & B_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * The workitem queue. 
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, device vnodes that
+ * filesystems are mounted on are delayed only about half the time
+ * that file data is delayed.
+ * Similarly, directory updates are more critical, so they are delayed
+ * only about a third of the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time_second;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again.
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + struct vnode *oldvp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + oldvp = bp->b_vp; + if (bp->b_xflags & B_VNDIRTY) + listheadp = &oldvp->v_dirtyblkhd; + else + listheadp = &oldvp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + vdrop(oldvp); + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
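+ * The vnode is also put on the syncer worklist if it is not there
+ * already; as the switch below shows, directories get roughly
+ * syncdelay/3, mounted block devices syncdelay/2, and everything
+ * else the full syncdelay.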
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = syncdelay / 3; + break; + case VBLK: + if (newvp->v_specmountpoint != NULL) { + delay = syncdelay / 2; + break; + } + /* fall through */ + default: + delay = syncdelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= B_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + } else { + if (bp->b_lblkno >= 0) { + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } else { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + } + } + } else { + bp->b_xflags |= B_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + bp->b_vp = newvp; + vhold(bp->b_vp); + splx(s); +} + +/* + * Create a vnode for a block device. + * Used for mounting the root file system. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + /* XXX 255 is for mfs. */ + if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev || + bdevsw[major(dev)] == NULL))) { + *vpp = NULLVP; + return (ENXIO); + } + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VBLK; + if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) { + vput(vp); + vp = nvp; + } + *vpp = vp; + return (0); +} + +/* + * Check to see if the new vnode represents a special device + * for which we already have a vnode (either because of + * bdevvp() or because of a different vnode representing + * the same block device). If such an alias exists, deallocate + * the existing contents and return the aliased vnode. The + * caller is responsible for filling it with its new contents. + */ +struct vnode * +checkalias(nvp, nvp_rdev, mp) + register struct vnode *nvp; + dev_t nvp_rdev; + struct mount *mp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + struct vnode **vpp; + + if (nvp->v_type != VBLK && nvp->v_type != VCHR) + return (NULLVP); + + vpp = &speclisth[SPECHASH(nvp_rdev)]; +loop: + simple_lock(&spechash_slock); + for (vp = *vpp; vp; vp = vp->v_specnext) { + if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + * Only alias active device nodes. + * Not sure why we don't re-use this like we do below. + */ + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + simple_unlock(&spechash_slock); + vgonel(vp, p); + goto loop; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { + /* + * It dissappeared, and we may have slept. + * Restart from the beginning + */ + simple_unlock(&spechash_slock); + goto loop; + } + break; + } + /* + * It would be a lot clearer what is going on here if + * this had been expressed as: + * if ( vp && (vp->v_tag == VT_NULL)) + * and the clauses had been swapped. + */ + if (vp == NULL || vp->v_tag != VT_NON) { + /* + * Put the new vnode into the hash chain. + * and if there was an alias, connect them. 
+		 */
+		MALLOC(nvp->v_specinfo, struct specinfo *,
+		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
+		nvp->v_rdev = nvp_rdev;
+		nvp->v_hashchain = vpp;
+		nvp->v_specnext = *vpp;
+		nvp->v_specmountpoint = NULL;
+		simple_unlock(&spechash_slock);
+		*vpp = nvp;
+		if (vp != NULLVP) {
+			nvp->v_flag |= VALIASED;
+			vp->v_flag |= VALIASED;
+			vput(vp);
+		}
+		return (NULLVP);
+	}
+	/*
+	 * if ( vp && (vp->v_tag == VT_NULL))
+	 * We have a vnode alias, but it is trashed.
+	 * Make it look like it was newly allocated (by getnewvnode()).
+	 * The caller should use this instead.
+	 */
+	simple_unlock(&spechash_slock);
+	VOP_UNLOCK(vp, 0, p);
+	simple_lock(&vp->v_interlock);
+	vclean(vp, 0, p);
+	vp->v_op = nvp->v_op;
+	vp->v_tag = nvp->v_tag;
+	nvp->v_type = VNON;
+	insmntque(vp, mp);
+	return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ */
+int
+vget(vp, flags, p)
+	register struct vnode *vp;
+	int flags;
+	struct proc *p;
+{
+	int error;
+
+	/*
+	 * If the vnode is in the process of being cleaned out for
+	 * another use, we wait for the cleaning to finish and then
+	 * return failure. Cleaning is determined by checking that
+	 * the VXLOCK flag is set.
+	 */
+	if ((flags & LK_INTERLOCK) == 0) {
+		simple_lock(&vp->v_interlock);
+	}
+	if (vp->v_flag & VXLOCK) {
+		vp->v_flag |= VXWANT;
+		simple_unlock(&vp->v_interlock);
+		tsleep((caddr_t)vp, PINOD, "vget", 0);
+		return (ENOENT);
+	}
+
+	vp->v_usecount++;
+
+	if (VSHOULDBUSY(vp))
+		vbusy(vp);
+	if (flags & LK_TYPE_MASK) {
+		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
+			/*
+			 * must expand vrele here because we do not want
+			 * to call VOP_INACTIVE if the reference count
+			 * drops back to zero since it was never really
+			 * active. We must remove it from the free list
+			 * before sleeping so that multiple processes do
+			 * not try to recycle it.
+			 */
+			simple_lock(&vp->v_interlock);
+			vp->v_usecount--;
+			if (VSHOULDFREE(vp))
+				vfree(vp);
+			simple_unlock(&vp->v_interlock);
+		}
+		return (error);
+	}
+	simple_unlock(&vp->v_interlock);
+	return (0);
+}
+
+void
+vref(struct vnode *vp)
+{
+	simple_lock(&vp->v_interlock);
+	vp->v_usecount++;
+	simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(vp)
+	struct vnode *vp;
+{
+	struct proc *p = curproc;	/* XXX */
+
+	KASSERT(vp != NULL, ("vrele: null vp"));
+
+	simple_lock(&vp->v_interlock);
+
+	if (vp->v_usecount > 1) {
+
+		vp->v_usecount--;
+		simple_unlock(&vp->v_interlock);
+
+		return;
+	}
+
+	if (vp->v_usecount == 1) {
+
+		vp->v_usecount--;
+		if (VSHOULDFREE(vp))
+			vfree(vp);
+		/*
+		 * If we are doing a vput, the node is already locked, and we must
+		 * call VOP_INACTIVE with the node locked. So, in the case of
+		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + simple_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +void +vput(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vput: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, p); + return; + + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * One less who cares about this vnode. + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If MNT_NOFORCE is specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If MNT_FORCE is specified, detach any active vnodes + * that are found. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, skipvp, flags) + struct mount *mp; + struct vnode *skipvp; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + /* + * Skip over a selected vnode. + */ + if (vp == skipvp) + continue; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + simple_unlock(&vp->v_interlock); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
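+ * (For example, a filesystem's unmount path might call
+ * vflush(mp, NULLVP, FORCECLOSE) when MNT_FORCE was requested;
+ * the exact flags are up to the caller.)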
+ */ + if (flags & FORCECLOSE) { + simple_unlock(&mntvnode_slock); + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgonel(vp, p); + } else { + vclean(vp, 0, p); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + simple_lock(&mntvnode_slock); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + simple_unlock(&vp->v_interlock); + busy++; + } + simple_unlock(&mntvnode_slock); + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + */ +static void +vclean(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int active; + vm_object_t obj; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + + /* + * Clean out any buffers associated with the vnode. + */ + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); + if (obj = vp->v_object) { + if (obj->ref_count == 0) { + /* + * This is a normal way of shutting down the object/vnode + * association. + */ + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + } + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); + VOP_INACTIVE(vp, p); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, p); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, p)) + panic("vclean: cannot reclaim"); + + if (active) + vrele(vp); + + cache_purge(vp); + if (vp->v_vnlock) { +#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ +#ifdef DIAGNOSTIC + if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + vprint("vclean: lock not drained", vp); +#endif +#endif + FREE(vp->v_vnlock, M_VNODE); + vp->v_vnlock = NULL; + } + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t) vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. 
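+ * This is normally reached as VOP_REVOKE(vp, REVOKEALL), e.g. from
+ * the revoke(2) system call.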
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + simple_lock(&vp->v_interlock); + vgonel(vp, p); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(vp, p) + struct vnode *vp; + struct proc *p; +{ + int s; + struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, p); + simple_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. 
+ */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { + simple_lock(&spechash_slock); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + simple_unlock(&spechash_slock); + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VFREE) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + } else if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + freevnodes++; + } else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + splx(s); + } + + vp->v_type = VBAD; + simple_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + register struct vnode *vp; + int rc = 0; + + simple_lock(&spechash_slock); + for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { + if (dev != vp->v_rdev || type != vp->v_type) + continue; + *vpp = vp; + rc = 1; + break; + } + simple_unlock(&spechash_slock); + return (rc); +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(vp) + register struct vnode *vp; +{ + struct vnode *vq, *vnext; + int count; + +loop: + if ((vp->v_flag & VALIASED) == 0) + return (vp->v_usecount); + simple_lock(&spechash_slock); + for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { + vnext = vq->v_specnext; + if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + if (vq->v_usecount == 0 && vq != vp) { + simple_unlock(&spechash_slock); + vgone(vq); + goto loop; + } + count += vq->v_usecount; + } + simple_unlock(&spechash_slock); + return (count); +} +/* + * Print out a description of a vnode. 
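+ * (Used, for instance, by the DDB ``show lockedvnodes'' command and
+ * by the DIAGNOSTIC busyprt code above.)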
+ */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if 0 +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} +#endif + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +#if 0 +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. 
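+ * For example, a disk filesystem's mount code can call
+ * vfs_mountedon(devvp) and refuse (EBUSY) to mount a device vnode
+ * that already backs a mounted filesystem.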
+ */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + struct vnode *vq; + int error = 0; + + if (vp->v_specmountpoint != NULL) + return (EBUSY); + if (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specmountpoint != NULL) { + error = EBUSY; + break; + } + } + simple_unlock(&spechash_slock); + } + return (error); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall() +{ + struct mount *mp, *nmp; + struct proc *p; + int error; + + if (curproc != NULL) + p = curproc; + else + p = initproc; /* XXX XXX should this be proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { + nmp = mp->mnt_list.cqe_prev; + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } + } +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* 
+ * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct sockaddr *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. 
+ */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; + struct vm_object *obj; + int anyio, tries; + + tries = 5; +loop: + anyio = 0; + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + nvp = vp->v_mntvnodes.le_next; + + if (vp->v_mount != mp) { + goto loop; + } + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (flags != MNT_WAIT) { + obj = vp->v_object; + if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) + continue; + if (VOP_ISLOCKED(vp)) + continue; + } + + simple_lock(&vp->v_interlock); + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + if (!vget(vp, + LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + anyio = 1; + } + vput(vp); + } + } else { + simple_unlock(&vp->v_interlock); + } + } + if (anyio && (--tries > 0)) + goto loop; +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, p, cred) + struct vnode *vp; + struct proc *p; + struct ucred *cred; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + + if ((vp->v_type != VREG) && (vp->v_type != VBLK)) + return 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (major(vp->v_rdev) < nblkdev && + bdevsw[major(vp->v_rdev)] != NULL) { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + } + + if (vp->v_object) + vp->v_flag |= VOBJBUF; + +retn: + return error; +} + +static void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } else { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + } + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. 
Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+	struct vnode *vp;
+	struct proc *p;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_revents & events) {
+		/*
+		 * This leaves events we are not interested
+		 * in available for the other process which
+		 * presumably had requested them
+		 * (otherwise they would never have been
+		 * recorded).
+		 */
+		events &= vp->v_pollinfo.vpi_revents;
+		vp->v_pollinfo.vpi_revents &= ~events;
+
+		simple_unlock(&vp->v_pollinfo.vpi_lock);
+		return events;
+	}
+	vp->v_pollinfo.vpi_events |= events;
+	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+	return 0;
+}
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+	struct vnode *vp;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events & events) {
+		/*
+		 * We clear vpi_events so that we don't
+		 * call selwakeup() twice if two events are
+		 * posted before the polling process(es) is
+		 * awakened. This also ensures that we take at
+		 * most one selwakeup() if the polling process
+		 * is no longer interested. However, it does
+		 * mean that only one event can be noticed at
+		 * a time. (Perhaps we should only clear those
+		 * event bits which we note?) XXX
+		 */
+		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
+		vp->v_pollinfo.vpi_revents |= events;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+	struct vnode *vp;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events) {
+		vp->v_pollinfo.vpi_events = 0;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
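+ *
+ * Each mounted filesystem gets one of these via vfs_allocate_syncvnode()
+ * below; sched_sync() above then fsyncs it once per pass with MNT_LAZY,
+ * which is what ends up calling sync_fsync().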
+ */
+#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
+static int	sync_fsync __P((struct vop_fsync_args *));
+static int	sync_inactive __P((struct vop_inactive_args *));
+static int	sync_reclaim __P((struct vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
+static int	sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+static vop_t **sync_vnodeop_p;
+static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
+	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
+	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
+	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
+	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
+	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
+	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
+	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
+	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
+	{ NULL, NULL }
+};
+static struct vnodeopv_desc sync_vnodeop_opv_desc =
+	{ &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+	struct mount *mp;
+{
+	struct vnode *vp;
+	static long start, incr, next;
+	int error;
+
+	/* Allocate a new vnode */
+	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+		mp->mnt_syncer = NULL;
+		return (error);
+	}
+	vp->v_type = VNON;
+	/*
+	 * Place the vnode onto the syncer worklist. We attempt to
+	 * scatter them about on the list so that they will go off
+	 * at evenly distributed times even if all the filesystems
+	 * are mounted at once.
+	 */
+	next += incr;
+	if (next == 0 || next > syncer_maxdelay) {
+		start /= 2;
+		incr /= 2;
+		if (start == 0) {
+			start = syncer_maxdelay / 2;
+			incr = syncer_maxdelay;
+		}
+		next = start;
+	}
+	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+	mp->mnt_syncer = vp;
+	return (0);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *syncvp = ap->a_vp;
+	struct mount *mp = syncvp->v_mount;
+	struct proc *p = ap->a_p;
+	int asyncflag;
+
+	/*
+	 * We only need to do something if this is a lazy evaluation.
+	 */
+	if (ap->a_waitfor != MNT_LAZY)
+		return (0);
+
+	/*
+	 * Move ourselves to the back of the sync list.
+	 */
+	vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+	/*
+	 * Walk the list of vnodes pushing all that are dirty and
+	 * not already on the sync list.
+	 */
+	simple_lock(&mountlist_slock);
+	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
+		simple_unlock(&mountlist_slock);
+		return (0);
+	}
+	asyncflag = mp->mnt_flag & MNT_ASYNC;
+	mp->mnt_flag &= ~MNT_ASYNC;
+	vfs_msync(mp, MNT_NOWAIT);
+	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+	if (asyncflag)
+		mp->mnt_flag |= MNT_ASYNC;
+	vfs_unbusy(mp, p);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	vgone(ap->a_vp);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c new file mode 100644 index 0000000..18e39d6 --- /dev/null +++ b/sys/kern/vfs_extattr.c @@ -0,0 +1,3034 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/dirent.h> + +#include <miscfs/union/union.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); +static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t)); +static int setfmode __P((struct proc *, struct vnode *, int)); +static int setfflags __P((struct proc *, struct vnode *, int)); +static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int)); +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *)); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, flag2 = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + flag2 = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (uintptr_t)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + linker_file_t lf; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) { + vput(vp); + return EPERM; + } + /* Only load modules for root (very important!) */ + if (error = suser(p->p_ucred, &p->p_acflag)) { + vput(vp); + return error; + } + error = linker_load_file(fstypename, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return error; + } + lf->userrefs++; + /* lookup again, see if the VFS was loaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + simple_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + simple_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + simple_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; + VOP_UNLOCK(vp, 0, p); +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME | + MNT_NOSYMFOLLOW | + MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOSYMFOLLOW | + MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + /* + * Mount the filesystem. 
+ */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = flag2; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, p); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + simple_unlock(&vp->v_interlock); + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + simple_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(p, uap) + struct proc *p; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Don't allow unmounting the root file system. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), p)); +} + +/* + * Do the actual file system unmount. 
+ */ +int +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + int async_flag; + + simple_lock(&mountlist_slock); + mp->mnt_kern_flag |= MNTK_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + simple_lock(&mountlist_slock); + if (error) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mp->mnt_kern_flag &= ~MNTK_UNMOUNT; + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE, + &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + return (error); + } + CIRCLEQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { + coveredvp->v_mountedhere = (struct mount *)0; + vrele(coveredvp); + } + mp->mnt_vfc->vfc_refcount--; + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + free((caddr_t)mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(p, uap) + struct proc *p; + struct sync_args *uap; +{ + register struct mount *mp, *nmp; + int asyncflag; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((p != NULL) ? p->p_ucred : NOCRED), p); + mp->mnt_flag |= asyncflag; + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(p, uap) + struct proc *p; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p)); +} + +/* + * Get filesystem statistics. 
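+ *
+ * For illustration only (a hypothetical userland sketch, not part of this
+ * file), the call implemented below is typically used as:
+ *
+ *	struct statfs fs;
+ *	if (statfs("/usr", &fs) == 0)
+ *		printf("%ld blocks free\n", (long)fs.f_bfree);
+ *
+ * Note that non-root callers get a copy with the filesystem id zeroed.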
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + p->p_retval[0] = maxcount; + else + p->p_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
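+ *
+ * A minimal sketch of the intended use (hypothetical, for illustration
+ * only): save the current directory, wander off, and return to it:
+ *
+ *	int fd = open(".", O_RDONLY);
+ *	chdir("/tmp");
+ *	fchdir(fd);
+ *	close(fd);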
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
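+ *
+ * As a rough illustration (hypothetical userland usage, not part of this
+ * file), the O_EXLOCK handling below lets a caller open and lock a file
+ * in one step:
+ *
+ *	int fd = open("/var/run/example.pid", O_RDWR | O_CREAT | O_EXLOCK, 0644);
+ *
+ * where the path name is made up for the example.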
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + p->p_retval[0] = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (vp->v_object == NULL)) + vfs_object_create(vp, p, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + p->p_retval[0] = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
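+ *
+ * For illustration only (a hypothetical device node; the major number is
+ * made up):
+ *
+ *	mknod("/dev/example", S_IFCHR | 0600, makedev(12, 0));
+ *
+ * Only the superuser may create nodes, as enforced below.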
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + } + } + vrele(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = zalloc(namei_zone); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)(p->p_retval) = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
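+ *
+ * The check is made against the real rather than the effective ids, which
+ * is why the credentials are temporarily swapped below. A hypothetical
+ * userland sketch, for illustration only:
+ *
+ *	if (access("/path/to/spoolfile", R_OK | W_OK) == 0)
+ *		proceed();
+ *
+ * where both the path and proceed() are made up for the example.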
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+	char	*path;
+	struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(p, uap)
+	struct proc *p;
+	register struct stat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct stat *) ub;
+	} */ *uap;
+{
+	struct stat sb;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = vn_stat(nd.ni_vp, &sb, p);
+	vput(nd.ni_vp);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+	return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+	char	*path;
+	struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(p, uap)
+	struct proc *p;
+	register struct lstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct stat *) ub;
+	} */ *uap;
+{
+	int error;
+	struct vnode *vp;
+	struct stat sb;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	error = vn_stat(vp, &sb, p);
+	vput(vp);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+	return (error);
+}
+
+void
+cvtnstat(sb, nsb)
+	struct stat *sb;
+	struct nstat *nsb;
+{
+	nsb->st_dev = sb->st_dev;
+	nsb->st_ino = sb->st_ino;
+	nsb->st_mode = sb->st_mode;
+	nsb->st_nlink = sb->st_nlink;
+	nsb->st_uid = sb->st_uid;
+	nsb->st_gid = sb->st_gid;
+	nsb->st_rdev = sb->st_rdev;
+	nsb->st_atimespec = sb->st_atimespec;
+	nsb->st_mtimespec = sb->st_mtimespec;
+	nsb->st_ctimespec = sb->st_ctimespec;
+	nsb->st_size = sb->st_size;
+	nsb->st_blocks = sb->st_blocks;
+	nsb->st_blksize = sb->st_blksize;
+	nsb->st_flags = sb->st_flags;
+	nsb->st_gen = sb->st_gen;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(p, uap)
+	struct proc *p;
+	register struct nstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct nstat *) ub;
+	} */ *uap;
+{
+	struct stat sb;
+	struct nstat nsb;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = vn_stat(nd.ni_vp, &sb, p);
+	vput(nd.ni_vp);
+	if (error)
+		return (error);
+	cvtnstat(&sb, &nsb);
+	error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+	return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(p, uap)
+	struct proc *p;
+	register struct nlstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct nstat *) ub;
+	} */ *uap;
+{
+	int error;
+	struct vnode *vp;
+	struct stat sb;
+	struct nstat nsb;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	error = vn_stat(vp, &sb, p);
+	vput(vp);
+	if (error)
+		return (error);
+	cvtnstat(&sb, &nsb);
+	error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+	return (error);
+}
+
+/*
+ * Get configurable pathname variables.
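+ *
+ * A hypothetical userland sketch, for illustration only:
+ *
+ *	long max = pathconf("/usr", _PC_NAME_MAX);
+ *
+ * which reports the longest file name the underlying filesystem accepts,
+ * or -1 on error.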
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +static int +setfflags(p, vp, flags) + struct proc *p; + struct vnode *vp; + int flags; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfflags(p, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags)); +} + +static int +setfmode(p, vp, mode) + struct proc *p; + struct vnode *vp; + int mode; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change mode of a file given path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(p, uap) + struct proc *p; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode)); +} + +static int +setfown(p, vp, uid, gid) + struct proc *p; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(p, uap) + struct proc *p; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+	int	fd;
+	int	uid;
+	int	gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(p, uap)
+	struct proc *p;
+	register struct fchown_args /* {
+		syscallarg(int) fd;
+		syscallarg(int) uid;
+		syscallarg(int) gid;
+	} */ *uap;
+{
+	struct file *fp;
+	int error;
+
+	if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+		return (error);
+	return setfown(p, (struct vnode *)fp->f_data,
+	    SCARG(uap, uid), SCARG(uap, gid));
+}
+
+static int
+setutimes(p, vp, tv, nullflag)
+	struct proc *p;
+	struct vnode *vp;
+	struct timeval *tv;
+	int nullflag;
+{
+	int error;
+	struct vattr vattr;
+
+	VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+	VATTR_NULL(&vattr);
+	vattr.va_atime.tv_sec = tv[0].tv_sec;
+	vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+	vattr.va_mtime.tv_sec = tv[1].tv_sec;
+	vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+	if (nullflag)
+		vattr.va_vaflags |= VA_UTIMES_NULL;
+	error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+	VOP_UNLOCK(vp, 0, p);
+	return error;
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+	char	*path;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(p, uap)
+	struct proc *p;
+	register struct utimes_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	int error;
+	struct nameidata nd;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = setutimes(p, nd.ni_vp, tv, nullflag);
+	vrele(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Set the access and modification times of a file; this version does
+ * not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+	char	*path;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(p, uap)
+	struct proc *p;
+	register struct lutimes_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	int error;
+	struct nameidata nd;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+
+	error = setutimes(p, nd.ni_vp, tv, nullflag);
+	vrele(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Set the access and modification times of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+	int	fd;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(p, uap)
+	struct proc *p;
+	register struct futimes_args /* {
+		syscallarg(int) fd;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	struct file *fp;
+	int error;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+
+	if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+		return (error);
+	return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag);
+}
+
+/*
+ * Truncate a file given its path name.
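+ *
+ * For illustration only (hypothetical usage, path made up):
+ *
+ *	truncate("/var/tmp/scratch", 0);
+ *
+ * Negative lengths are rejected with EINVAL below.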
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
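+ *
+ * A minimal sketch of typical use (hypothetical, for illustration only):
+ *
+ *	write(fd, buf, len);
+ *	if (fsync(fd) != 0)
+ *		err(1, "fsync");
+ *
+ * The call returns once the vnode's dirty pages and buffers have been
+ * flushed (MNT_WAIT below).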
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) + vm_object_page_clean(vp->v_object, 0, 0, 0); + if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 && + vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && + bioops.io_fsync) + error = (*bioops.io_fsync)(vp); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. + */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); + zfree(namei_zone, tond.ni_cnd.cn_pnbuf); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
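+ *
+ * For illustration only (hypothetical usage, path made up):
+ *
+ *	if (mkdir("/tmp/work", 0755) != 0 && errno != EEXIST)
+ *		err(1, "mkdir");
+ *
+ * The requested mode is masked with the process umask (fd_cmask) below.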
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. 
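+ *
+ * A hedged sketch of typical userland use (hypothetical, for illustration
+ * only):
+ *
+ *	char buf[4096];
+ *	long base;
+ *	int n = getdirentries(fd, buf, sizeof(buf), &base);
+ *
+ * after which buf holds n bytes of packed struct dirent records and base
+ * holds the offset of the block just read.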
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + if (SCARG(uap, basep) != NULL) { + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + } + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(p, uap) + struct proc *p; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return getdirentries(p, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + p->p_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); + +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, ""); + +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); +int +__getcwd(p, uap) + struct proc *p; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = p->p_fd; + slash_prefixed = 0; + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 0000000..43589c74 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $Id: vfs_init.c,v 1.40 1998/11/15 15:18:30 bde Exp $ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <vm/vm_zone.h> + + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * XXX this bloat just exands the sysctl__vfs linker set a little so that + * we can attach sysctls for VFS modules without expanding the linker set. + * Currently (1998/09/06), only one VFS uses sysctls, so 2 extra linker + * set slots are more than sufficient. + */ +extern struct linker_set sysctl__vfs; +static int mod_xx; +SYSCTL_INT(_vfs, OID_AUTO, mod0, CTLFLAG_RD, &mod_xx, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, mod1, CTLFLAG_RD, &mod_xx, 0, ""); + +/* + * Zone for namei + */ +struct vm_zone *namei_zone; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. 
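A rough sketch of the registration idiom that comment describes, as a filesystem of this vintage would spell it; the samplefs_* names are made up, while vop_default_desc, vop_defaultop and VNODEOP_SET() are the stock vnode-interface pieces:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/vnode.h>

static vop_t **samplefs_vnodeop_p;	/* vector filled in by vfs_opv_recalc() */

static int
samplefs_lookup(ap)
	struct vop_lookup_args *ap;
{
	*ap->a_vpp = NULL;		/* placeholder implementation */
	return (EOPNOTSUPP);
}

static struct vnodeopv_entry_desc samplefs_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_defaultop },	/* slot 1 fallback */
	{ &vop_lookup_desc,	(vop_t *) samplefs_lookup },
	{ NULL, NULL }
};
static struct vnodeopv_desc samplefs_vnodeop_opv_desc =
	{ &samplefs_vnodeop_p, samplefs_vnodeop_entries };

/* Added to the linker set consumed by vfs_add_vnodeops()/vfs_opv_recalc(). */
VNODEOP_SET(samplefs_vnodeop_opv_desc);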
+ */ + +/* Table of known vnodeop vectors (list of VFS vnode vectors) */ +static struct vnodeopv_desc **vnodeopv_descs; +static int vnodeopv_num; + +/* Table of known descs (list of vnode op handlers "vop_access_desc") */ +static struct vnodeop_desc **vfs_op_descs; +static int *vfs_op_desc_refs; /* reference counts */ +static int num_op_descs; +static int vfs_opv_numops; + +static void +vfs_opv_recalc(void) +{ + int i, j; + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; + struct vnodeopv_entry_desc *opve_descp; + struct vnodeopv_desc *opv; + + if (vfs_op_descs == NULL) + panic("vfs_opv_recalc called with null vfs_op_descs"); + + /* + * Run through and make sure all known descs have an offset + * + * vop_default_desc is hardwired at offset 1, and offset 0 + * is a panic sanity check. + */ + vfs_opv_numops = 0; + for (i = 0; i < num_op_descs; i++) + if (vfs_opv_numops < (vfs_op_descs[i]->vdesc_offset + 1)) + vfs_opv_numops = vfs_op_descs[i]->vdesc_offset + 1; + for (i = 0; i < num_op_descs; i++) + if (vfs_op_descs[i]->vdesc_offset == 0) + vfs_op_descs[i]->vdesc_offset = vfs_opv_numops++; + /* + * Allocate and fill in the vectors + */ + for (i = 0; i < vnodeopv_num; i++) { + opv = vnodeopv_descs[i]; + opv_desc_vector_p = opv->opv_desc_vector_p; + if (*opv_desc_vector_p) + FREE(*opv_desc_vector_p, M_VNODE); + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, M_WAITOK); + if (*opv_desc_vector_p == NULL) + panic("no memory for vop_t ** vector"); + bzero(*opv_desc_vector_p, vfs_opv_numops * sizeof(vop_t *)); + + /* Fill in, with slot 0 being panic */ + opv_desc_vector = *opv_desc_vector_p; + opv_desc_vector[0] = (vop_t *)vop_panic; + for (j = 0; opv->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(opv->opv_desc_ops[j]); + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + + /* Replace unfilled routines with their default (slot 1). 
*/ + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector[1] == NULL) + panic("vfs_opv_recalc: vector without a default."); + for (j = 0; j < vfs_opv_numops; j++) + if (opv_desc_vector[j] == NULL) + opv_desc_vector[j] = opv_desc_vector[1]; + } +} + +void +vfs_add_vnodeops(void *data) +{ + struct vnodeopv_desc *opv; + struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j; + + opv = (struct vnodeopv_desc *)data; + MALLOC(newopv, struct vnodeopv_desc **, + (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (newopv == NULL) + panic("vfs_add_vnodeops: no memory"); + if (vnodeopv_descs) { + bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + } + newopv[vnodeopv_num] = opv; + vnodeopv_descs = newopv; + vnodeopv_num++; + + /* See if we have turned up a new vnode op desc */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, increase reference count */ + vfs_op_desc_refs[j]++; + break; + } + } + if (j == num_op_descs) { + /* not found, new entry */ + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs + 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + if (newop == NULL) + panic("vfs_add_vnodeops: no memory for desc"); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs + 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (newref == NULL) + panic("vfs_add_vnodeops: no memory for refs"); + if (vfs_op_descs) { + bcopy(vfs_op_descs, newop, + num_op_descs * sizeof(*newop)); + FREE(vfs_op_descs, M_VNODE); + } + if (vfs_op_desc_refs) { + bcopy(vfs_op_desc_refs, newref, + num_op_descs * sizeof(*newref)); + FREE(vfs_op_desc_refs, M_VNODE); + } + newop[num_op_descs] = desc; + newref[num_op_descs] = 1; + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs++; + } + } + vfs_opv_recalc(); +} + +void +vfs_rm_vnodeops(void *data) +{ + struct vnodeopv_desc *opv; + struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j, k; + + opv = (struct vnodeopv_desc *)data; + /* Lower ref counts on descs in the table and release if zero */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, decrease reference count */ + vfs_op_desc_refs[j]--; + break; + } + } + for (j = 0; j < num_op_descs; j++) { + if (vfs_op_desc_refs[j] > 0) + continue; + if (vfs_op_desc_refs[j] < 0) + panic("vfs_remove_vnodeops: negative refcnt"); + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs - 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + if (newop == NULL) + panic("vfs_remove_vnodeops: no memory for desc"); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs - 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (newref == NULL) + panic("vfs_remove_vnodeops: no memory for refs"); + for (k = j; k < (num_op_descs - 1); k++) { + vfs_op_descs[k] = vfs_op_descs[k + 1]; + vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1]; + } + bcopy(vfs_op_descs, newop, + (num_op_descs - 1) * sizeof(*newop)); + bcopy(vfs_op_desc_refs, newref, + (num_op_descs - 1) * sizeof(*newref)); + FREE(vfs_op_descs, M_VNODE); + FREE(vfs_op_desc_refs, M_VNODE); + vfs_op_descs = newop; + 
vfs_op_desc_refs = newref; + num_op_descs--; + } + } + + for (i = 0; i < vnodeopv_num; i++) { + if (vnodeopv_descs[i] == opv) { + for (j = i; j < (vnodeopv_num - 1); j++) + vnodeopv_descs[j] = vnodeopv_descs[j + 1]; + break; + } + } + if (i == vnodeopv_num) + panic("vfs_remove_vnodeops: opv not found"); + MALLOC(newopv, struct vnodeopv_desc **, + (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (newopv == NULL) + panic("vfs_remove_vnodeops: no memory"); + bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + vnodeopv_descs = newopv; + vnodeopv_num--; + + vfs_opv_recalc(); +} + +/* + * Routines having to do with the management of the vnode table. + */ +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each file system type. + */ +/* ARGSUSED*/ +static void +vfsinit(void *dummy) +{ + + namei_zone = zinit("NAMEI", MAXPATHLEN, 0, 0, 2); + + /* + * Initialize the vnode table + */ + vntblinit(); + /* + * Initialize the vnode name cache + */ + nchinit(); + /* + * Initialize each file system type. + * Vfs type numbers must be distinct from VFS_GENERIC (and VFS_VFSCONF). + */ + vattr_null(&va_null); + maxvfsconf = VFS_GENERIC + 1; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) + +int +vfs_register(struct vfsconf *vfc) +{ + struct linker_set *l; + struct sysctl_oid **oidpp; + struct vfsconf *vfsp; + int i, exists; + + vfsp = NULL; + l = &sysctl__vfs; + if (vfsconf) + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + return EEXIST; + + vfc->vfc_typenum = maxvfsconf++; + if (vfc->vfc_vfsops->vfs_oid != NULL) { + /* + * Attach the oid to the "vfs" node of the sysctl tree if + * it isn't already there (it will be there for statically + * configured vfs's). + */ + exists = 0; + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i-- != 0; oidpp++) + if (*oidpp == vfc->vfc_vfsops->vfs_oid) { + exists = 1; + break; + } + if (exists == 0) + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i-- != 0; oidpp++) { + if (*oidpp == NULL || + *oidpp == &sysctl___vfs_mod0 || + *oidpp == &sysctl___vfs_mod1) { + *oidpp = vfc->vfc_vfsops->vfs_oid; + break; + } + } + + vfc->vfc_vfsops->vfs_oid->oid_number = vfc->vfc_typenum; + sysctl_order_all(); + } + if (vfsp) + vfsp->vfc_next = vfc; + else + vfsconf = vfc; + vfc->vfc_next = NULL; + + /* + * Call init function for this VFS... 
+ */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + return 0; +} + + +int +vfs_unregister(struct vfsconf *vfc) +{ + struct linker_set *l; + struct sysctl_oid **oidpp; + struct vfsconf *vfsp, *prev_vfsp; + int error, i, maxtypenum; + + i = vfc->vfc_typenum; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) + return EINVAL; + if (vfsp->vfc_refcount) + return EBUSY; + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error) + return (error); + } + if (prev_vfsp) + prev_vfsp->vfc_next = vfsp->vfc_next; + else + vfsconf = vfsp->vfc_next; + if (vfsp->vfc_vfsops->vfs_oid != NULL) { + l = &sysctl__vfs; + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i--; oidpp++) { + if (*oidpp == vfsp->vfc_vfsops->vfs_oid) { + *oidpp = NULL; + sysctl_order_all(); + break; + } + } + } + maxtypenum = VFS_GENERIC; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + return 0; +} + +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: /* including MOD_SHUTDOWN */ + break; + } + return (error); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 0000000..67efd52 --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,706 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $Id: vfs_lookup.c,v 1.30 1999/01/08 17:31:16 eivind Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/proc.h> + +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm_zone.h> + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. + * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(ndp) + register struct nameidata *ndp; +{ + register struct filedesc *fdp; /* pointer to file descriptor state */ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct uio auio; + int error, linklen; + struct componentname *cnp = &ndp->ni_cnd; + struct proc *p = cnp->cn_proc; + + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred; + KASSERT(cnp->cn_cred && cnp->cn_proc, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + fdp = cnp->cn_proc->p_fd; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = zalloc(namei_zone); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + + if (error) { + zfree(namei_zone, cnp->cn_pnbuf); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(cnp->cn_proc, KTR_NAMEI)) + ktrnamei(cnp->cn_proc->p_tracep, cnp->cn_pnbuf); +#endif + + /* + * Get starting point for the translation. + */ + ndp->ni_rootdir = fdp->fd_rdir; + + dp = fdp->fd_cdir; + VREF(dp); + for (;;) { + /* + * Check if root directory should replace current directory. + * Done at start of translation and after symbolic link. 
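As a reference point for the translation loop that follows, this is roughly how in-kernel callers drive namei(); the samplefs_stat_path() wrapper is hypothetical, while NDINIT(), namei(), VOP_GETATTR() and vput() are the interfaces used throughout this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
samplefs_stat_path(p, upath, vap)
	struct proc *p;
	char *upath;			/* user-space pathname */
	struct vattr *vap;
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, upath, p);
	if ((error = namei(&nd)) != 0)
		return (error);		/* pathname buffer already released */
	error = VOP_GETATTR(nd.ni_vp, vap, p->p_ucred, p);
	vput(nd.ni_vp);			/* LOCKLEAF: unlock and release */
	return (error);
}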
+ */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + dp = ndp->ni_rootdir; + VREF(dp); + } + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error) { + zfree(namei_zone, cnp->cn_pnbuf); + return (error); + } + /* + * Check for symbolic link + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) + zfree(namei_zone, cnp->cn_pnbuf); + else + cnp->cn_flags |= HASBUF; + + if (ndp->ni_vp && ndp->ni_vp->v_type == VREG && + (cnp->cn_nameiop != DELETE) && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == + LOCKLEAF)) + vfs_object_create(ndp->ni_vp, + ndp->ni_cnd.cn_proc, + ndp->ni_cnd.cn_cred); + + return (0); + } + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + VOP_UNLOCK(ndp->ni_dvp, 0, p); + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } + if (ndp->ni_pathlen > 1) + cp = zalloc(namei_zone); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = (struct proc *)0; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { + if (ndp->ni_pathlen > 1) + zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + } + zfree(namei_zone, cnp->cn_pnbuf); + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. 
and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(ndp) + register struct nameidata *ndp; +{ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int trailing_slash; + int error = 0; + struct componentname *cnp = &ndp->ni_cnd; + struct proc *p = cnp->cn_proc; + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + ndp->ni_dvp = NULL; + cnp->cn_flags &= ~ISSYMLINK; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +dirloop: + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + cnp->cn_consume = 0; + cnp->cn_hash = 0; + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + cnp->cn_hash += (unsigned char)*cp; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
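The per-component scan at the top of dirloop can be mimicked in user space; this stand-alone sketch (component_hash() is a made-up name) shows that the additive cn_hash covers only the bytes up to the next slash:

#include <stdio.h>

static unsigned int
component_hash(const char *cp)
{
	unsigned int hash = 0;

	while (*cp != '\0' && *cp != '/') {
		hash += (unsigned char)*cp;
		cp++;
	}
	return (hash);
}

int
main(void)
{
	/* Both calls hash only "local", since the scan stops at '/'. */
	printf("%u\n", component_hash("local"));
	printf("%u\n", component_hash("local/bin"));
	return (0);
}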
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0, p); + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + /* + * Handle "..": two special cases. + * 1. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 2. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other file system. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + if (dp == ndp->ni_rootdir || dp == rootvnode) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_flag & VROOT) == 0 || + (cnp->cn_flags & NOCROSSMOUNT)) + break; + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + ASSERT_VOP_LOCKED(dp, "lookup"); + if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_flag & VROOT) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + goto unionlookup; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + return (0); + } +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + + ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } + + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted file system. + */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + goto bad2; + vput(dp); + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + +nextname: + /* + * Not a symbolic link. 
If more pathname, + * continue at next component, else return. + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != ndp->ni_vp) { + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); + } + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) + vrele(ndp->ni_dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0') + VOP_UNLOCK(ndp->ni_dvp, 0, p); + vrele(ndp->ni_dvp); +bad: + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + struct proc *p = cnp->cn_proc; + struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +/* dirloop: */ + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + newhash += (unsigned char)*cp; + if (newhash != cnp->cn_hash) + panic("relookup: bad hash"); + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0, p); + *vpp = dp; + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if (error = VOP_LOOKUP(dp, vpp, cnp)) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. 
+ */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + + if (dp->v_type == VREG && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) + vfs_object_create(dp, cnp->cn_proc, cnp->cn_cred); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, p); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..a7a830f --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include "opt_bootp.h" + +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/kernel.h> +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct"); + +/* + * These define the root filesystem, device, and root filesystem type. + */ +dev_t rootdevs[] = { NODEV, NODEV }; +char *rootdevnames[2]; +struct vnode *rootvnode; +char *mountrootfsname; +#ifdef BOOTP +extern void bootpc_init __P((void)); +#endif + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. + */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * NONE + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +static void +vfs_mountrootfs(void *unused) +{ + struct mount *mp; + int i, err; + struct proc *p = curproc; /* XXX */ + dev_t orootdev; + +#ifdef BOOTP + bootpc_init(); +#endif + /* + * New root mount structure + */ + if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) { + printf("error %d: ", err); + panic("cannot mount root\n"); + return ; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = ENXIO; + orootdev = rootdev; + if (rootdevs[0] == NODEV) + rootdevs[0] = rootdev; + for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) { + if (rootdevs[i] == NODEV) + break; + rootdev = rootdevs[i]; + if (rootdev != orootdev) { + printf("changing root device to %s\n", rootdevnames[i]); + orootdev = rootdev; + } + strncpy(mp->mnt_stat.f_mntfromname, + rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1); + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err != ENXIO) + break; + } + if (err) { + /* + * XXX should ask the user for the name in some cases. + * Why do we call vfs_unbusy() here and not after ENXIO + * is returned above? + */ + vfs_unbusy(mp, p); + /* + * free mount struct before failing + * (hardly worthwhile with the PANIC eh?) 
+ */ + free( mp, M_MOUNT); + printf("error %d: ", err); + panic("cannot mount root (2)\n"); + return; + } + + simple_lock(&mountlist_slock); + + /* + * Add fs to list of mounted file systems + */ + CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); + + simple_unlock(&mountlist_slock); + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + return; +} + +SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL) + diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c new file mode 100644 index 0000000..44b1698 --- /dev/null +++ b/sys/kern/vfs_subr.c @@ -0,0 +1,2872 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vmmeter.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void insmntque __P((struct vnode *vp, struct mount *mp)); +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +static void vfree __P((struct vnode *)); +static void vgonel __P((struct vnode *vp, struct proc *p)); +static unsigned long numvnodes; +SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct tobefreelist vnode_tobefree_list; /* vnode free list */ + +static u_long wantfreevnodes = 25; +SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +static u_long freevnodes = 0; +SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +int vfs_ioopt = 0; +#ifdef ENABLE_VFS_IOOPT +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +struct simplelock mntvnode_slock; +int nfs_mount_type = -1; +#ifndef NULL_SIMPLELOCKS +static struct simplelock mntid_slock; +static struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; +#endif +struct nfs_public nfs_pub; /* publicly exported FS */ +static vm_zone_t vnode_zone; + +/* + * The workitem queue. + */ +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +time_t syncdelay = 30; +int rushjob; /* number of slots to run ASAP */ + +static int syncer_delayno = 0; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. 
+ */ +void +vntblinit() +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + TAILQ_INIT(&vnode_tobefree_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); + vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. 
+ */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot) __P((void)); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = mp->mnt_list.cqe_next) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + simple_unlock(&mountlist_slock); + return (mp); + } + } + simple_unlock(&mountlist_slock); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_short xxxfs_mntid; + + fsid_t tfsid; + int mtype; + + simple_lock(&mntid_slock); + mtype = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); + mp->mnt_stat.f_fsid.val[1] = mtype; + if (xxxfs_mntid == 0) + ++xxxfs_mntid; + tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); + tfsid.val[1] = mtype; + if (mountlist.cqh_first != (void *)&mountlist) { + while (vfs_getvfs(&tfsid)) { + tfsid.val[0]++; + xxxfs_mntid++; + } + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + simple_unlock(&mntid_slock); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern vop_t **dead_vnodeop_p; + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *tvp, *nvp; + vm_object_t object; + TAILQ_HEAD(freelst, vnode) vnode_tmp_list; + + /* + * We take the least recently used vnode from the freelist + * if we can get it and it has no cached pages, and no + * namecache entries are relative to it. 
+ * Otherwise we allocate a new vnode + */ + + s = splbio(); + simple_lock(&vnode_free_list_slock); + TAILQ_INIT(&vnode_tmp_list); + + for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + vp->v_flag &= ~(VTBFREE|VAGE); + vp->v_flag |= VFREE; + if (vp->v_usecount) + panic("tobe free vnode isn't"); + freevnodes++; + } + + if (wantfreevnodes && freevnodes < wantfreevnodes) { + vp = NULL; + } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { + /* + * XXX: this is only here to be backwards compatible + */ + vp = NULL; + } else { + for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + if (!simple_lock_try(&vp->v_interlock)) + continue; + if (vp->v_usecount) + panic("free vnode isn't"); + + object = vp->v_object; + if (object && (object->resident_page_count || object->ref_count)) { + printf("object inconsistant state: RPC: %d, RC: %d\n", + object->resident_page_count, object->ref_count); + /* Don't recycle if it's caching some pages */ + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); + continue; + } else if (LIST_FIRST(&vp->v_cache_src)) { + /* Don't recycle if active in the namecache */ + simple_unlock(&vp->v_interlock); + continue; + } else { + break; + } + } + } + + for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { + nvp = TAILQ_NEXT(tvp, v_freelist); + TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); + simple_unlock(&tvp->v_interlock); + } + + if (vp) { + vp->v_flag |= VDOOMED; + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + simple_unlock(&vnode_free_list_slock); + cache_purge(vp); + vp->v_lease = NULL; + if (vp->v_type != VBAD) { + vgonel(vp, p); + } else { + simple_unlock(&vp->v_interlock); + } + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } +#endif + vp->v_flag = 0; + vp->v_lastr = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_maxio = 0; + } else { + simple_unlock(&vnode_free_list_slock); + vp = (struct vnode *) zalloc(vnode_zone); + bzero((char *) vp, sizeof *vp); + simple_lock_init(&vp->v_interlock); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + splx(s); + + vfs_object_create(vp, p, p->p_ucred); + return (0); +} + +/* + * Move a vnode from one mount queue to another. + */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + simple_lock(&mntvnode_slock); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + LIST_REMOVE(vp, v_mntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. 
+ */ + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + simple_unlock(&mntvnode_slock); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t) &vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 4), "vinvalbuf", + slptimeo); + if (error) { + splx(s); + return (error); + } + break; + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while VOP_BWRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_ASYNC); + VOP_BWRITE(bp); + } + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + (void) VOP_BWRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + simple_lock(&vp->v_interlock); + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + simple_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. 
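A hedged sketch of a typical consumer: a filesystem's truncate path choosing between vinvalbuf() above and vtruncbuf() below (the samplefs_trim_buffers() wrapper is hypothetical; the two signatures are taken from this file):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
samplefs_trim_buffers(vp, length, cred, p, blksize)
	struct vnode *vp;
	off_t length;
	struct ucred *cred;
	struct proc *p;
	int blksize;
{

	if (length == 0)
		/* Dropping the whole file: flush dirty data, then invalidate. */
		return (vinvalbuf(vp, V_SAVE, cred, p, 0, 0));
	/* Otherwise throw away only buffers beyond the new end of file. */
	return (vtruncbuf(vp, cred, p, length, blksize));
}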
+ */ +int +vtruncbuf(vp, cred, p, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct proc *p; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb1", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb2", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO, "vtrb3", 0); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + VOP_BWRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= B_VNCLEAN; + bp->b_xflags &= ~B_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + if (bp->b_xflags & B_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * The workitem queue. 
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, filesystems mounted on
+ * block devices are delayed only about half the time that file data is
+ * delayed. Similarly, directory updates are more critical, so are only
+ * delayed about a third the time that file data is delayed. Thus, there
+ * are SYNCER_MAXDELAY queues that are processed round-robin at a rate
+ * of one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be
+ * processed. Items that need to be processed soon are placed in this
+ * queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time_second;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again.
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + struct vnode *oldvp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + oldvp = bp->b_vp; + if (bp->b_xflags & B_VNDIRTY) + listheadp = &oldvp->v_dirtyblkhd; + else + listheadp = &oldvp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + vdrop(oldvp); + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
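+	 *
+	 * The dirty list is kept roughly sorted by logical block number so
+	 * that writeback tends to reach the disk sequentially; an insert
+	 * walks forward while the existing entries have smaller b_lblkno
+	 * and then uses TAILQ_INSERT_AFTER, so (illustrative) a buffer for
+	 * block 7 lands between existing buffers for blocks 5 and 9.
+	 * Buffers with negative block numbers (indirect blocks) simply go
+	 * to the tail.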
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = syncdelay / 3; + break; + case VBLK: + if (newvp->v_specmountpoint != NULL) { + delay = syncdelay / 2; + break; + } + /* fall through */ + default: + delay = syncdelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= B_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + } else { + if (bp->b_lblkno >= 0) { + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } else { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + } + } + } else { + bp->b_xflags |= B_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + bp->b_vp = newvp; + vhold(bp->b_vp); + splx(s); +} + +/* + * Create a vnode for a block device. + * Used for mounting the root file system. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + /* XXX 255 is for mfs. */ + if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev || + bdevsw[major(dev)] == NULL))) { + *vpp = NULLVP; + return (ENXIO); + } + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VBLK; + if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) { + vput(vp); + vp = nvp; + } + *vpp = vp; + return (0); +} + +/* + * Check to see if the new vnode represents a special device + * for which we already have a vnode (either because of + * bdevvp() or because of a different vnode representing + * the same block device). If such an alias exists, deallocate + * the existing contents and return the aliased vnode. The + * caller is responsible for filling it with its new contents. + */ +struct vnode * +checkalias(nvp, nvp_rdev, mp) + register struct vnode *nvp; + dev_t nvp_rdev; + struct mount *mp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + struct vnode **vpp; + + if (nvp->v_type != VBLK && nvp->v_type != VCHR) + return (NULLVP); + + vpp = &speclisth[SPECHASH(nvp_rdev)]; +loop: + simple_lock(&spechash_slock); + for (vp = *vpp; vp; vp = vp->v_specnext) { + if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + * Only alias active device nodes. + * Not sure why we don't re-use this like we do below. + */ + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + simple_unlock(&spechash_slock); + vgonel(vp, p); + goto loop; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { + /* + * It dissappeared, and we may have slept. + * Restart from the beginning + */ + simple_unlock(&spechash_slock); + goto loop; + } + break; + } + /* + * It would be a lot clearer what is going on here if + * this had been expressed as: + * if ( vp && (vp->v_tag == VT_NULL)) + * and the clauses had been swapped. + */ + if (vp == NULL || vp->v_tag != VT_NON) { + /* + * Put the new vnode into the hash chain. + * and if there was an alias, connect them. 
+ */ + MALLOC(nvp->v_specinfo, struct specinfo *, + sizeof(struct specinfo), M_VNODE, M_WAITOK); + nvp->v_rdev = nvp_rdev; + nvp->v_hashchain = vpp; + nvp->v_specnext = *vpp; + nvp->v_specmountpoint = NULL; + simple_unlock(&spechash_slock); + *vpp = nvp; + if (vp != NULLVP) { + nvp->v_flag |= VALIASED; + vp->v_flag |= VALIASED; + vput(vp); + } + return (NULLVP); + } + /* + * if ( vp && (vp->v_tag == VT_NULL)) + * We have a vnode alias, but it is a trashed. + * Make it look like it's newley allocated. (by getnewvnode()) + * The caller should use this instead. + */ + simple_unlock(&spechash_slock); + VOP_UNLOCK(vp, 0, p); + simple_lock(&vp->v_interlock); + vclean(vp, 0, p); + vp->v_op = nvp->v_op; + vp->v_tag = nvp->v_tag; + nvp->v_type = VNON; + insmntque(vp, mp); + return (vp); +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. The vnode lock bit is set the + * vnode is being eliminated in vgone. The process is awakened + * when the transition is completed, and an error returned to + * indicate that the vnode is no longer usable (possibly having + * been changed to a new file system type). + */ +int +vget(vp, flags, p) + register struct vnode *vp; + int flags; + struct proc *p; +{ + int error; + + /* + * If the vnode is in the process of being cleaned out for + * another use, we wait for the cleaning to finish and then + * return failure. Cleaning is determined by checking that + * the VXLOCK flag is set. + */ + if ((flags & LK_INTERLOCK) == 0) { + simple_lock(&vp->v_interlock); + } + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } + + vp->v_usecount++; + + if (VSHOULDBUSY(vp)) + vbusy(vp); + if (flags & LK_TYPE_MASK) { + if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { + /* + * must expand vrele here because we do not want + * to call VOP_INACTIVE if the reference count + * drops back to zero since it was never really + * active. We must remove it from the free list + * before sleeping so that multiple processes do + * not try to recycle it. + */ + simple_lock(&vp->v_interlock); + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + simple_unlock(&vp->v_interlock); + } + return (error); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +void +vref(struct vnode *vp) +{ + simple_lock(&vp->v_interlock); + vp->v_usecount++; + simple_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vrele: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + simple_unlock(&vp->v_interlock); + + return; + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 
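+	 *
+	 * Illustrative caller patterns (not part of this file): code that
+	 * holds only a reference drops it with
+	 *
+	 *	vrele(vp);
+	 *
+	 * while code that still holds the vnode lock uses
+	 *
+	 *	vput(vp);
+	 *
+	 * and either path ends in VOP_INACTIVE() once the use count
+	 * reaches zero.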
+ */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + simple_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +void +vput(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vput: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, p); + return; + + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * One less who cares about this vnode. + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If MNT_NOFORCE is specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If MNT_FORCE is specified, detach any active vnodes + * that are found. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, skipvp, flags) + struct mount *mp; + struct vnode *skipvp; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + /* + * Skip over a selected vnode. + */ + if (vp == skipvp) + continue; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + simple_unlock(&vp->v_interlock); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
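+		 *
+		 * Illustrative use (not part of this file): a filesystem's
+		 * unmount code would typically do something like
+		 *
+		 *	error = vflush(mp, NULLVP,
+		 *	    (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
+		 *
+		 * where "mntflags" stands for the caller's unmount flags,
+		 * so a forced unmount reclaims even vnodes that are still
+		 * busy.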
+ */ + if (flags & FORCECLOSE) { + simple_unlock(&mntvnode_slock); + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgonel(vp, p); + } else { + vclean(vp, 0, p); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + simple_lock(&mntvnode_slock); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + simple_unlock(&vp->v_interlock); + busy++; + } + simple_unlock(&mntvnode_slock); + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + */ +static void +vclean(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int active; + vm_object_t obj; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + + /* + * Clean out any buffers associated with the vnode. + */ + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); + if (obj = vp->v_object) { + if (obj->ref_count == 0) { + /* + * This is a normal way of shutting down the object/vnode + * association. + */ + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + } + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); + VOP_INACTIVE(vp, p); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, p); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, p)) + panic("vclean: cannot reclaim"); + + if (active) + vrele(vp); + + cache_purge(vp); + if (vp->v_vnlock) { +#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ +#ifdef DIAGNOSTIC + if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + vprint("vclean: lock not drained", vp); +#endif +#endif + FREE(vp->v_vnlock, M_VNODE); + vp->v_vnlock = NULL; + } + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t) vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. 
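+ *
+ * This is the back end of revoke(2); a caller goes through the vnode
+ * interface, e.g. (illustrative)
+ *
+ *	VOP_REVOKE(vp, REVOKEALL);
+ *
+ * and ends up here for vnodes whose filesystems use this default
+ * implementation.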
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + simple_lock(&vp->v_interlock); + vgonel(vp, p); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(vp, p) + struct vnode *vp; + struct proc *p; +{ + int s; + struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, p); + simple_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. 
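+	 *
+	 * The alias chain is singly linked through v_specnext, so the code
+	 * below must walk from *v_hashchain to find the predecessor of vp
+	 * before it can splice vp out; removing C from a chain A -> B -> C
+	 * (illustrative) means stopping at B, where vq->v_specnext == vp,
+	 * and then setting vq->v_specnext = vp->v_specnext.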
+ */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { + simple_lock(&spechash_slock); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + simple_unlock(&spechash_slock); + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VFREE) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + } else if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + freevnodes++; + } else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + splx(s); + } + + vp->v_type = VBAD; + simple_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + register struct vnode *vp; + int rc = 0; + + simple_lock(&spechash_slock); + for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { + if (dev != vp->v_rdev || type != vp->v_type) + continue; + *vpp = vp; + rc = 1; + break; + } + simple_unlock(&spechash_slock); + return (rc); +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(vp) + register struct vnode *vp; +{ + struct vnode *vq, *vnext; + int count; + +loop: + if ((vp->v_flag & VALIASED) == 0) + return (vp->v_usecount); + simple_lock(&spechash_slock); + for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { + vnext = vq->v_specnext; + if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + if (vq->v_usecount == 0 && vq != vp) { + simple_unlock(&spechash_slock); + vgone(vq); + goto loop; + } + count += vq->v_usecount; + } + simple_unlock(&spechash_slock); + return (count); +} +/* + * Print out a description of a vnode. 
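+ *
+ * The result is a one-line summary plus an optional filesystem-specific
+ * dump from VOP_PRINT(); with made-up values it looks roughly like:
+ *
+ *	0xf0153400: type VREG, usecount 1, writecount 0, refcount 2, flags (VOBJBUF)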
+ */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if 0 +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} +#endif + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +#if 0 +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. 
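+ *
+ * Mount code uses this to refuse a device vnode that already has a
+ * filesystem on it, e.g. (illustrative, "devvp" being the device vnode
+ * the caller is about to mount):
+ *
+ *	if ((error = vfs_mountedon(devvp)) != 0)
+ *		return (error);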
+ */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + struct vnode *vq; + int error = 0; + + if (vp->v_specmountpoint != NULL) + return (EBUSY); + if (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specmountpoint != NULL) { + error = EBUSY; + break; + } + } + simple_unlock(&spechash_slock); + } + return (error); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall() +{ + struct mount *mp, *nmp; + struct proc *p; + int error; + + if (curproc != NULL) + p = curproc; + else + p = initproc; /* XXX XXX should this be proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { + nmp = mp->mnt_list.cqe_prev; + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } + } +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* 
+ * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct sockaddr *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. 
+ */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; + struct vm_object *obj; + int anyio, tries; + + tries = 5; +loop: + anyio = 0; + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + nvp = vp->v_mntvnodes.le_next; + + if (vp->v_mount != mp) { + goto loop; + } + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (flags != MNT_WAIT) { + obj = vp->v_object; + if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) + continue; + if (VOP_ISLOCKED(vp)) + continue; + } + + simple_lock(&vp->v_interlock); + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + if (!vget(vp, + LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + anyio = 1; + } + vput(vp); + } + } else { + simple_unlock(&vp->v_interlock); + } + } + if (anyio && (--tries > 0)) + goto loop; +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, p, cred) + struct vnode *vp; + struct proc *p; + struct ucred *cred; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + + if ((vp->v_type != VREG) && (vp->v_type != VBLK)) + return 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (major(vp->v_rdev) < nblkdev && + bdevsw[major(vp->v_rdev)] != NULL) { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + } + + if (vp->v_object) + vp->v_flag |= VOBJBUF; + +retn: + return error; +} + +static void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } else { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + } + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. 
Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+	struct vnode *vp;
+	struct proc *p;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_revents & events) {
+		/*
+		 * This leaves events we are not interested
+		 * in available for the other process which
+		 * presumably had requested them
+		 * (otherwise they would never have been
+		 * recorded).
+		 */
+		events &= vp->v_pollinfo.vpi_revents;
+		vp->v_pollinfo.vpi_revents &= ~events;
+
+		simple_unlock(&vp->v_pollinfo.vpi_lock);
+		return events;
+	}
+	vp->v_pollinfo.vpi_events |= events;
+	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+	return 0;
+}
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+	struct vnode *vp;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events & events) {
+		/*
+		 * We clear vpi_events so that we don't
+		 * call selwakeup() twice if two events are
+		 * posted before the polling process(es) is
+		 * awakened. This also ensures that we take at
+		 * most one selwakeup() if the polling process
+		 * is no longer interested. However, it does
+		 * mean that only one event can be noticed at
+		 * a time. (Perhaps we should only clear those
+		 * event bits which we note?) XXX
+		 */
+		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
+		vp->v_pollinfo.vpi_revents |= events;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+	struct vnode *vp;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events) {
+		vp->v_pollinfo.vpi_events = 0;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
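+ *
+ * One of these is created for every read-write mount; the mount and
+ * unmount paths do, e.g.
+ *
+ *	if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ *		error = vfs_allocate_syncvnode(mp);
+ *
+ * and the syncer daemon above then fsyncs it with MNT_LAZY once per
+ * pass, which is what drives sync_fsync() below.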
+ */ +#define sync_close ((int (*) __P((struct vop_close_args *)))nullop) +static int sync_fsync __P((struct vop_fsync_args *)); +static int sync_inactive __P((struct vop_inactive_args *)); +static int sync_reclaim __P((struct vop_reclaim_args *)); +#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) +#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) +static int sync_print __P((struct vop_print_args *)); +#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) + +static vop_t **sync_vnodeop_p; +static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_close_desc, (vop_t *) sync_close }, /* close */ + { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ + { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ + { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ + { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ + { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ + { &vop_print_desc, (vop_t *) sync_print }, /* print */ + { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ + { NULL, NULL } +}; +static struct vnodeopv_desc sync_vnodeop_opv_desc = + { &sync_vnodeop_p, sync_vnodeop_entries }; + +VNODEOP_SET(sync_vnodeop_opv_desc); + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +int +vfs_allocate_syncvnode(mp) + struct mount *mp; +{ + struct vnode *vp; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { + mp->mnt_syncer = NULL; + return (error); + } + vp->v_type = VNON; + /* + * Place the vnode onto the syncer worklist. We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); + mp->mnt_syncer = vp; + return (0); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + struct proc *p = ap->a_p; + int asyncflag; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + vn_syncer_add_to_worklist(syncvp, syncdelay); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + simple_lock(&mountlist_slock); + if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) { + simple_unlock(&mountlist_slock); + return (0); + } + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vfs_unbusy(mp, p); + return (0); +} + +/* + * The syncer vnode is no referenced. + */ +static int +sync_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct proc *a_p; + } */ *ap; +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. 
+ */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 0000000..18e39d6 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,3034 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/dirent.h> + +#include <miscfs/union/union.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); +static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t)); +static int setfmode __P((struct proc *, struct vnode *, int)); +static int setfflags __P((struct proc *, struct vnode *, int)); +static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int)); +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *)); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, flag2 = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + flag2 = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
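+	 *
+	 * For example (illustrative): with the vfs.usermount sysctl set
+	 * to 1, an unprivileged user may mount on a directory that user
+	 * owns, but mounting on a root-owned directory still fails the
+	 * va_uid check below and returns the error from suser().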
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (uintptr_t)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + linker_file_t lf; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) { + vput(vp); + return EPERM; + } + /* Only load modules for root (very important!) */ + if (error = suser(p->p_ucred, &p->p_acflag)) { + vput(vp); + return error; + } + error = linker_load_file(fstypename, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return error; + } + lf->userrefs++; + /* lookup again, see if the VFS was loaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + simple_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + simple_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + simple_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; + VOP_UNLOCK(vp, 0, p); +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME | + MNT_NOSYMFOLLOW | + MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOSYMFOLLOW | + MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + /* + * Mount the filesystem. 
+ */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = flag2; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, p); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + simple_unlock(&vp->v_interlock); + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + simple_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(p, uap) + struct proc *p; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Don't allow unmounting the root file system. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), p)); +} + +/* + * Do the actual file system unmount. 
+ */ +int +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + int async_flag; + + simple_lock(&mountlist_slock); + mp->mnt_kern_flag |= MNTK_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + simple_lock(&mountlist_slock); + if (error) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mp->mnt_kern_flag &= ~MNTK_UNMOUNT; + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE, + &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + return (error); + } + CIRCLEQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { + coveredvp->v_mountedhere = (struct mount *)0; + vrele(coveredvp); + } + mp->mnt_vfc->vfc_refcount--; + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + free((caddr_t)mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(p, uap) + struct proc *p; + struct sync_args *uap; +{ + register struct mount *mp, *nmp; + int asyncflag; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((p != NULL) ? p->p_ucred : NOCRED), p); + mp->mnt_flag |= asyncflag; + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(p, uap) + struct proc *p; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p)); +} + +/* + * Get filesystem statistics. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + p->p_retval[0] = maxcount; + else + p->p_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
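+ *
+ * When the directory referenced by the descriptor is itself covered by a
+ * mount, the loop over v_mountedhere below steps down to the root of the
+ * mounted filesystem, so a sequence sketched as
+ *
+ *	fd = open("/mnt", O_RDONLY);
+ *	fchdir(fd);
+ *
+ * (the path is only an illustration) leaves the working directory on the
+ * mounted filesystem's root vnode rather than on the covered vnode.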
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
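+ *
+ * The user's O_RDONLY/O_WRONLY/O_RDWR value is turned into the kernel's
+ * FREAD/FWRITE bits with FFLAGS() before vn_open() is called, so a call
+ * sketched as
+ *
+ *	open("/etc/motd", O_RDONLY)
+ *
+ * (an illustrative path) reaches vn_open() with FREAD set and FWRITE
+ * clear; O_EXLOCK and O_SHLOCK are handled afterwards via VOP_ADVLOCK().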
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + p->p_retval[0] = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (vp->v_object == NULL)) + vfs_object_create(vp, p, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + p->p_retval[0] = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + } + } + vrele(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = zalloc(namei_zone); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)(p->p_retval) = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
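+ *
+ * The check is made with the process's real uid and gid, which are
+ * temporarily substituted into the credential below, so a set-uid
+ * program can ask what its invoking user is permitted to do.  In a
+ * sketch such as
+ *
+ *	if (access("/var/log/messages", R_OK) == 0)
+ *		can_read = 1;
+ *
+ * (path and variable are illustrative) R_OK, W_OK and X_OK map onto
+ * VREAD, VWRITE and VEXEC respectively.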
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(p, uap) + struct proc *p; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(p, uap) + struct proc *p; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(p, uap) + struct proc *p; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +nlstat(p, uap) + struct proc *p; + register struct nlstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nstat nsb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get configurable pathname variables. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +static int +setfflags(p, vp, flags) + struct proc *p; + struct vnode *vp; + int flags; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfflags(p, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags)); +} + +static int +setfmode(p, vp, mode) + struct proc *p; + struct vnode *vp; + int mode; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change mode of a file given path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(p, uap) + struct proc *p; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode)); +} + +static int +setfown(p, vp, uid, gid) + struct proc *p; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(p, uap) + struct proc *p; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(p, uap) + struct proc *p; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfown(p, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); +} + +static int +setutimes(p, vp, tv, nullflag) + struct proc *p; + struct vnode *vp; + struct timeval *tv; + int nullflag; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(p, uap) + struct proc *p; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + int error; + struct nameidata nd; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setutimes(p, nd.ni_vp, tv, nullflag); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(p, uap) + struct proc *p; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + int error; + struct nameidata nd; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + + error = setutimes(p, nd.ni_vp, tv, nullflag); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(p, uap) + struct proc *p; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + struct file *fp; + int error; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag); +} + +/* + * Truncate a file given its path name. 
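+ *
+ * Both shrinking and extending go through VOP_SETATTR() with va_size set;
+ * a negative length is rejected with EINVAL before any name lookup, so,
+ * for instance, truncate(path, -1) fails without touching the filesystem.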
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
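+ *
+ * The descriptor's vnode has its dirty VM pages cleaned and is then
+ * synced with VOP_FSYNC(..., MNT_WAIT, ...), so a sequence sketched as
+ *
+ *	write(fd, buf, len);
+ *	fsync(fd);
+ *
+ * does not return from fsync() until the data has been pushed to the
+ * underlying store (with an extra soft-updates flush hook when
+ * MNT_SOFTDEP is set).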
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) + vm_object_page_clean(vp->v_object, 0, 0, 0); + if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 && + vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && + bioops.io_fsync) + error = (*bioops.io_fsync)(vp); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. + */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); + zfree(namei_zone, tond.ni_cnd.cn_pnbuf); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. 
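+ *
+ * A typical consumer calls this in a loop, roughly as in the following
+ * sketch (buffer size and names are assumptions, not part of this file):
+ *
+ *	char buf[4096];
+ *	long base;
+ *	int n;
+ *
+ *	n = getdirentries(fd, buf, sizeof(buf), &base);
+ *
+ * Each successful call copies out whole struct dirent records, stores the
+ * offset of the block just read through basep, and returns the number of
+ * bytes transferred; a return of 0 means end of directory.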
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + if (SCARG(uap, basep) != NULL) { + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + } + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(p, uap) + struct proc *p; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return getdirentries(p, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + p->p_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); + +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, ""); + +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); +int +__getcwd(p, uap) + struct proc *p; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = p->p_fd; + slash_prefixed = 0; + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c new file mode 100644 index 0000000..0b32a7d --- /dev/null +++ b/sys/kern/vfs_vnops.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $Id: vfs_vnops.c,v 1.61 1999/01/05 18:49:56 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> + +static int vn_closefile __P((struct file *fp, struct proc *p)); +static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data, + struct proc *p)); +static int vn_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int vn_poll __P((struct file *fp, int events, struct ucred *cred, + struct proc *p)); +static int vn_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); + +struct fileops vnops = + { vn_read, vn_write, vn_ioctl, vn_poll, vn_closefile }; + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
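+ *
+ * Illustrative call sequence (hypothetical locals; a sketch of how the
+ * open(2) path reaches this routine, not part of the original comment):
+ *
+ *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
+ *	error = vn_open(&nd, FREAD, 0);
+ *
+ * On success ndp->ni_vp is returned locked and referenced.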
+ */ +int +vn_open(ndp, fmode, cmode) + register struct nameidata *ndp; + int fmode, cmode; +{ + register struct vnode *vp; + register struct proc *p = ndp->ni_cnd.cn_proc; + register struct ucred *cred = p->p_ucred; + struct vattr vat; + struct vattr *vap = &vat; + int mode, error; + + if (fmode & O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + error = namei(ndp); + if (error) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE); + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + if (error) + return (error); + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); + ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; + } else { + VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + } + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + if ((fmode & O_CREAT) == 0) { + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, cred, p); + if (error) + goto bad; + } + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, p); /* XXX */ + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, cred, p); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, cred, p); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vp->v_type == VREG) { + if ((error = vfs_object_create(vp, p, cred)) != 0) + goto bad; + } + + if (fmode & FWRITE) + vp->v_writecount++; + return (0); +bad: + vput(vp); + return (error); +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + */ +int +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if (vp->v_flag & VTEXT) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +int +vn_close(vp, flags, cred, p) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, p); + vrele(vp); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
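+ *
+ * Illustrative usage (hypothetical locals, not part of the original
+ * comment):
+ *
+ *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, (off_t)0,
+ *	    UIO_SYSSPACE, 0, cred, &resid, p);
+ *
+ * reads len bytes from the start of the vnode into kernel memory and
+ * leaves any untransferred byte count in resid.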
+ */ +int +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct proc *p; +{ + struct uio auio; + struct iovec aiov; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_procp = p; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode read routine. + */ +static int +vn_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + struct proc *p = uio->uio_procp; + int count, error; + int flag; + + VOP_LEASE(vp, p, cred, LEASE_READ); + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); + if (uio->uio_offset == -1) + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + flag = 0; + if (fp->f_flag & FNONBLOCK) + flag |= IO_NDELAY; + + /* + * Sequential read heuristic. + * If we have been doing sequential input, + * a rewind operation doesn't turn off + * sequential input mode. + */ + if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) || + (fp->f_offset == fp->f_nextread)) { + int tmpseq = fp->f_seqcount; + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + tmpseq += ((count + BKVASIZE - 1) / BKVASIZE); + if (tmpseq >= 127) + tmpseq = 127; + fp->f_seqcount = tmpseq; + flag |= (fp->f_seqcount << 16); + } else { + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + } + + error = VOP_READ(vp, uio, flag, cred); + fp->f_offset += count - uio->uio_resid; + fp->f_nextread = fp->f_offset; + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode write routine. + */ +static int +vn_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + struct proc *p = uio->uio_procp; + int count, error, ioflag = IO_UNIT; + + if (uio->uio_offset == -1 && vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + error = VOP_WRITE(vp, uio, ioflag, cred); + if (ioflag & IO_APPEND) + fp->f_offset = uio->uio_offset; + else + fp->f_offset += count - uio->uio_resid; + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode stat routine. 
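+ *
+ * (Worked example: a regular file whose vattr reports va_mode 0644 comes
+ * out of the type switch below with st_mode == S_IFREG | 0644.)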
+ */ +int +vn_stat(vp, sb, p) + struct vnode *vp; + register struct stat *sb; + struct proc *p; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, p->p_ucred, p); + if (error) + return (error); + /* + * Copy from vattr table + */ + sb->st_dev = vap->va_fsid; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + /* This is a cosmetic change, symlinks do not have a mode. */ + if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) + sb->st_mode &= ~ACCESSPERMS; /* 0000 */ + else + sb->st_mode |= ACCESSPERMS; /* 0777 */ + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec = vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_blksize = vap->va_blocksize; + sb->st_flags = vap->va_flags; + if (p->p_ucred->cr_uid != 0) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else + sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(fp, com, data, p) + struct file *fp; + u_long com; + caddr_t data; + struct proc *p; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... */ + + default: +#if 0 + return (ENOTTY); +#endif + case VFIFO: + case VCHR: + case VBLK: + error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); + if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + if (p->p_session->s_ttyvp == vp) + return (0); + + /* Get rid of reference to old control tty */ + if (p->p_session->s_ttyvp) + vrele(p->p_session->s_ttyvp); + + p->p_session->s_ttyvp = vp; + VREF(vp); + } + return (error); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + + return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p)); +} + +/* + * Check that the vnode is still valid, and if so + * acquire requested lock. 
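+ *
+ * Illustrative pairing, as used by vn_open() and vn_write() above:
+ *
+ *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ *	... vnode operation ...
+ *	VOP_UNLOCK(vp, 0, p);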
+ */ +int +#ifndef DEBUG_LOCKS +vn_lock(vp, flags, p) +#else +debug_vn_lock(vp, flags, p, filename, line) +#endif + struct vnode *vp; + int flags; + struct proc *p; +#ifdef DEBUG_LOCKS + const char *filename; + int line; +#endif +{ + int error; + + do { + if ((flags & LK_INTERLOCK) == 0) + simple_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vn_lock", 0); + error = ENOENT; + } else { +#ifdef DEBUG_LOCKS + vp->filename = filename; + vp->line = line; +#endif + error = VOP_LOCK(vp, + flags | LK_NOPAUSE | LK_INTERLOCK, p); + if (error == 0) + return (error); + } + flags &= ~LK_INTERLOCK; + } while (flags & LK_RETRY); + return (error); +} + +/* + * File table vnode close routine. + */ +static int +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl new file mode 100644 index 0000000..8193edb --- /dev/null +++ b/sys/kern/vnode_if.pl @@ -0,0 +1,402 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. 
+SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static __inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 1, /* special case, vop_default => 1 */ + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. 
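+#
+# For illustration only (generated output, not script input): the vop_close
+# entry in vnode_if.src comes out of the script below roughly as
+#
+#	static int vop_close_vp_offsets[] = {
+#		VOPARG_OFFSETOF(struct vop_close_args,a_vp),
+#		VDESC_NO_OFFSET
+#	};
+#	struct vnodeop_desc vop_close_desc = {
+#		0,
+#		"vop_close",
+#		0,
+#		vop_close_vp_offsets,
+#		VDESC_NO_OFFSET,
+#		VOPARG_OFFSETOF(struct vop_close_args,a_cred),
+#		VOPARG_OFFSETOF(struct vop_close_args,a_p),
+#		VDESC_NO_OFFSET,
+#		NULL,
+#	};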
+$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. 
This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static __inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} + +extern int vfs_opv_numops; +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh new file mode 100644 index 0000000..8193edb --- /dev/null +++ b/sys/kern/vnode_if.sh @@ -0,0 +1,402 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. 
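+#
+# Illustrative invocation (an example, not an original comment):
+#
+#	sh vnode_if.sh /sys/kern/vnode_if.src
+#
+# (re)creates vnode_if.c and vnode_if.h in the current directory.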
+ +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static __inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 1, /* special case, vop_default => 1 */ + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. 
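+#
+# For illustration (not original commentary): an input entry such as
+#
+#	vop_close {
+#		IN struct vnode *vp;
+#		IN int fflag;
+#		IN struct ucred *cred;
+#		IN struct proc *p;
+#	};
+#
+# has already produced struct vop_close_args and the VOP_CLOSE() inline in
+# vnode_if.h by this point; the script below adds the matching
+# vop_close_vp_offsets[] and vop_close_desc to vnode_if.c.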
+$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. 
This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static __inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} + +extern int vfs_opv_numops; +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src new file mode 100644 index 0000000..48c9fef --- /dev/null +++ b/sys/kern/vnode_if.src @@ -0,0 +1,488 @@ +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $Id: vnode_if.src,v 1.18 1998/07/04 20:45:32 julian Exp $ +# + +# +# Above each of the vop descriptors is a specification of the locking +# protocol used by each vop call. The first column is the name of +# the variable, the remaining three columns are in, out and error +# respectively. The "in" column defines the lock state on input, +# the "out" column defines the state on succesful return, and the +# "error" column defines the locking state on error exit. 
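+#
+# For example (reading the annotations below): "#% setattr vp L L L" means
+# the vnode must be locked on entry and stays locked on both successful and
+# error return, while "#% close vp U U U" requires it unlocked throughout.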
+# +# The locking value can take the following values: +# L: locked. +# U: unlocked/ +# -: not applicable. vnode does not yet (or no longer) exists. +# =: the same on input and output, may be either L or U. +# X: locked if not nil. +# + +# +#% lookup dvp L ? ? +#% lookup vpp - L - +# +# XXX - the lookup locking protocol defies simple description and depends +# on the flags and operation fields in the (cnp) structure. Note +# especially that *vpp may equal dvp and both may be locked. +# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% cachedlookup dvp L ? ? +#% cachedlookup vpp - L - +# +# This must be an exact copy of lookup. See kern/vfs_cache.c for details. +# +vop_cachedlookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% create dvp L L L +#% create vpp - L - +# +vop_create { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% whiteout dvp L L L +#% whiteout cnp - - - +#% whiteout flag - - - +# +vop_whiteout { + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int flags; +}; + +# +#% mknod dvp L L L +#% mknod vpp - X - +# +vop_mknod { + IN struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% open vp L L L +# +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% close vp U U U +# +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% access vp L L L +# +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% getattr vp = = = +# +vop_getattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% setattr vp L L L +# +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% read vp L L L +# +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% write vp L L L +# +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% lease vp = = = +# +vop_lease { + IN struct vnode *vp; + IN struct proc *p; + IN struct ucred *cred; + IN int flag; +}; + +# +#% ioctl vp U U U +# +vop_ioctl { + IN struct vnode *vp; + IN u_long command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% poll vp U U U +# +vop_poll { + IN struct vnode *vp; + IN int events; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% revoke vp U U U +# +vop_revoke { + IN struct vnode *vp; + IN int flags; +}; + +# +# XXX - not used +# +vop_mmap { + IN struct vnode *vp; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% fsync vp L L L +# +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct proc *p; +}; + +# +#% remove dvp L L L +#% remove vp L L L +# +vop_remove { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% link tdvp L L L +#% link vp U U U +# +vop_link { + IN struct vnode *tdvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct 
componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +# +#% mkdir dvp L L L +#% mkdir vpp - L - +# +vop_mkdir { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% rmdir dvp L L L +#% rmdir vp L L L +# +vop_rmdir { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% symlink dvp L L L +#% symlink vpp - U - +# +# XXX - note that the return vnode has already been VRELE'ed +# by the filesystem layer. To use it you must use vget, +# possibly with a further namei. +# +vop_symlink { + IN struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +# +#% readdir vp L L L +# +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; + INOUT int *eofflag; + OUT int *ncookies; + INOUT u_long **cookies; +}; + +# +#% readlink vp L L L +# +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +# +#% abortop dvp = = = +# +vop_abortop { + IN struct vnode *dvp; + IN struct componentname *cnp; +}; + +# +#% inactive vp L U U +# +vop_inactive { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% reclaim vp U U U +# +vop_reclaim { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% lock vp U L U +# +vop_lock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% unlock vp L U L +# +vop_unlock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% bmap vp L L L +#% bmap vpp - U - +# +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t *bnp; + OUT int *runp; + OUT int *runb; +}; + +# +# Needs work: no vp? +# +vop_strategy { + IN struct vnode *vp; + IN struct buf *bp; +}; + +# +#% print vp = = = +# +vop_print { + IN struct vnode *vp; +}; + +# +#% islocked vp = = = +# +vop_islocked { + IN struct vnode *vp; +}; + +# +#% pathconf vp L L L +# +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT register_t *retval; +}; + +# +#% advlock vp U U U +# +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +# +#% balloc vp L L L +# +vop_balloc { + IN struct vnode *vp; + IN off_t startoffset; + IN int size; + IN struct ucred *cred; + IN int flags; + OUT struct buf **bpp; +}; + +# +#% reallocblks vp L L L +# +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + +# +#% freeblks vp - - - +# +# This call is used by the filesystem to release blocks back to +# device-driver. This is useful if the driver has a lengthy +# erase handling or similar. +# + +vop_freeblks { + IN struct vnode *vp; + IN daddr_t addr; + IN daddr_t length; +}; + +# +# Needs work: no vp? +# +#vop_bwrite { +# IN struct buf *bp; +#}; |