Diffstat (limited to 'sys/kern')
146 files changed, 108911 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 0000000..b958ba7 --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,103 @@ +# $FreeBSD$ +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 + +SYS?= ${.CURDIR}/.. + +# Common files for "make tags", included by the Makefile for each +# architecture. + +# Put the /sys/sys include files at the end so that subroutine definitions +# win when there is a struct tag with the same name (e.g., vmmeter). The +# better solution would be for ctags to generate "struct vmmeter" tags. + +COMM= ${SYS}/dev/advansys/*.[ch] \ + ${SYS}/dev/aha/*.[ch] \ + ${SYS}/dev/aic7xxx/*.[ch] \ + ${SYS}/dev/buslogic/*.[ch] \ + ${SYS}/dev/ccd/*.[ch] \ + ${SYS}/dev/dec/*.[ch] \ + ${SYS}/dev/dpt/*.[ch] \ + ${SYS}/dev/en/*.[ch] \ + ${SYS}/dev/hea/*.[ch] \ + ${SYS}/dev/hfa/*.[ch] \ + ${SYS}/dev/iicbus/*.[ch] \ + ${SYS}/dev/isp/*.[ch] \ + ${SYS}/dev/pdq/*.[ch] \ + ${SYS}/dev/ppbus/*.[ch] \ + ${SYS}/dev/smbus/*.[ch] \ + ${SYS}/dev/vx/*.[ch] \ + ${SYS}/fs/deadfs/*.[ch] \ + ${SYS}/fs/fdescfs/*.[ch] \ + ${SYS}/fs/fifofs/*.[ch] \ + ${SYS}/fs/msdosfs/*.[ch] \ + ${SYS}/fs/nullfs/*.[ch] \ + ${SYS}/fs/portalfs/*.[ch] \ + ${SYS}/fs/procfs/*.[ch] \ + ${SYS}/fs/specfs/*.[ch] \ + ${SYS}/fs/umapfs/*.[ch] \ + ${SYS}/fs/unionfs/*.[ch] \ + ${SYS}/isofs/cd9660/*.[ch] \ + ${SYS}/kern/*.[ch] \ + ${SYS}/net/*.[ch] \ + ${SYS}/netatalk/*.[ch] \ + ${SYS}/netatm/*.[ch] \ + ${SYS}/netinet/*.[ch] \ + ${SYS}/netipx/*.[ch] \ + ${SYS}/netkey/*.[ch] \ + ${SYS}/netnatm/*.[ch] \ + ${SYS}/netns/*.[ch] \ + ${SYS}/nfs/*.[ch] \ + ${SYS}/pci/*.[ch] \ + ${SYS}/posix4/*.[ch] \ + ${SYS}/ufs/ffs/*.[ch] \ + ${SYS}/ufs/ufs/*.[ch] \ + ${SYS}/vm/*.[ch] \ + ${SYS}/sys/*.[ch] + +COMMDIR1= ${SYS}/conf \ + ${SYS}/kern \ + ${SYS}/net \ + ${SYS}/netatalk \ + ${SYS}/netatm \ + ${SYS}/netinet \ + ${SYS}/netipx \ + ${SYS}/netkey \ + ${SYS}/netnatm \ + ${SYS}/netns \ + ${SYS}/nfs \ + ${SYS}/pci \ + ${SYS}/posix4 \ + ${SYS}/vm \ + ${SYS}/sys + +COMMDIR2= ${SYS}/dev/advansys \ + ${SYS}/dev/aha \ + ${SYS}/dev/aic7xxx \ + ${SYS}/dev/buslogic \ + ${SYS}/dev/ccd \ + ${SYS}/dev/dec \ + ${SYS}/dev/dpt \ + ${SYS}/dev/en \ + ${SYS}/dev/hea \ + ${SYS}/dev/hfa \ + ${SYS}/dev/iicbus \ + ${SYS}/dev/isp \ + ${SYS}/dev/pdq \ + ${SYS}/dev/ppbus \ + ${SYS}/dev/smbus \ + ${SYS}/dev/vn \ + ${SYS}/dev/vx \ + ${SYS}/fs/deadfs \ + ${SYS}/fs/devfs \ + ${SYS}/fs/fdescfs \ + ${SYS}/fs/fifofs \ + ${SYS}/fs/msdosfs \ + ${SYS}/fs/nullfs \ + ${SYS}/fs/portalfs \ + ${SYS}/fs/procfs \ + ${SYS}/fs/specfs \ + ${SYS}/fs/umapfs \ + ${SYS}/fs/unionfs \ + ${SYS}/isofs/cd9660 \ + ${SYS}/ufs/ffs \ + ${SYS}/ufs/ufs diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 0000000..cdfcc2a --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,54 @@ +# @(#)Makefile 8.2 (Berkeley) 3/21/94 +# $FreeBSD$ + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h \ +../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + -mv -f ../sys/syscall.mk ../sys/syscall.mk.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. 
The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE_ARCH}/tags. + +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + fs fs/deadfs fs/fdescfs fs/fifofs \ + fs/lofs fs/nullfs fs/portalfs fs/procfs \ + fs/specfs fs/umapfs fs/unionfs \ + hp hp/dev hp/hpux \ + kern libkern \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE_ARCH}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m new file mode 100644 index 0000000..bf8d4ac --- /dev/null +++ b/sys/kern/bus_if.m @@ -0,0 +1,246 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/bus.h> + +INTERFACE bus; + +# +# Default implementations of some methods. +# +CODE { + static struct resource * + null_alloc_resource(device_t dev, device_t child, + int type, int *rid, + u_long start, u_long end, + u_long count, u_int flags) + { + return 0; + } +}; + +# +# This is called from system code which prints out a description of a +# device. It should describe the attachment that the child has with +# the parent. For instance the TurboLaser bus prints which node the +# device is attached to. See bus_generic_print_child.9 for more +# information. +# This method returns the number of characters output. +# +METHOD int print_child { + device_t dev; + device_t child; +}; + +# +# Called for each child device that +# did not succeed in probing for a +# driver. +# +METHOD void probe_nomatch { + device_t dev; + device_t child; +}; + +# +# These two methods manage a bus specific set of instance variables of +# a child device. 
The intention is that each different type of bus +# defines a set of appropriate instance variables (such as ports and +# irqs for ISA bus etc.) +# +# This information could be given to the child device as a struct but +# that makes it hard for a bus to add or remove variables without +# forcing an edit and recompile for all drivers which may not be +# possible for vendor supplied binary drivers. + +# +# Read an instance variable. Return 0 on success. +# +METHOD int read_ivar { + device_t _dev; + device_t _child; + int _indx; + uintptr_t *_result; +}; + +# +# Write an instance variable. Return 0 on success. +# +METHOD int write_ivar { + device_t _dev; + device_t _child; + int _indx; + uintptr_t _value; +}; + +# +# Called after the child's DEVICE_DETACH method to allow the parent +# to reclaim any resources allocated on behalf of the child. +# +METHOD void child_detached { + device_t _dev; + device_t _child; +}; + +# +# Called when a new driver is added to the devclass which owns this +# bus. The generic implementation of this method attempts to probe and +# attach any un-matched children of the bus. +# +METHOD void driver_added { + device_t _dev; + driver_t *_driver; +} DEFAULT bus_generic_driver_added; + +# +# For busses which use use drivers supporting DEVICE_IDENTIFY to +# enumerate their devices, these methods are used to create new +# device instances. If place is non-NULL, the new device will be +# added after the last existing child with the same order. +# +METHOD device_t add_child { + device_t _dev; + int _order; + const char *_name; + int _unit; +}; + +# +# Allocate a system resource attached to `dev' on behalf of `child'. +# The types are defined in <machine/resource.h>; the meaning of the +# resource-ID field varies from bus to bus (but *rid == 0 is always +# valid if the resource type is). start and end reflect the allowable +# range, and should be passed as `0UL' and `~0UL', respectively, if +# the client has no range restriction. count is the number of consecutive +# indices in the resource required. flags is a set of sharing flags +# as defined in <sys/rman.h>. +# +# Returns a resource or a null pointer on failure. The caller is +# responsible for calling rman_activate_resource() when it actually +# uses the resource. +# +METHOD struct resource * alloc_resource { + device_t _dev; + device_t _child; + int _type; + int *_rid; + u_long _start; + u_long _end; + u_long _count; + u_int _flags; +} DEFAULT null_alloc_resource; + +METHOD int activate_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_r; +}; + +METHOD int deactivate_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_r; +}; + +# +# Free a resource allocated by the preceding method. The `rid' value +# must be the same as the one returned by BUS_ALLOC_RESOURCE (which +# is not necessarily the same as the one the client passed). +# +METHOD int release_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + struct resource *_res; +}; + +METHOD int setup_intr { + device_t _dev; + device_t _child; + struct resource *_irq; + int _flags; + driver_intr_t *_intr; + void *_arg; + void **_cookiep; +}; + +METHOD int teardown_intr { + device_t _dev; + device_t _child; + struct resource *_irq; + void *_cookie; +}; + +# +# Set the range used for a particular resource. Return EINVAL if +# the type or rid are out of range. 
+# +METHOD int set_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + u_long _start; + u_long _count; +}; + +# +# Get the range for a resource. Return ENOENT if the type or rid are +# out of range or have not been set. +# +METHOD int get_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; + u_long *_startp; + u_long *_countp; +}; + +# +# Delete a resource. +# +METHOD void delete_resource { + device_t _dev; + device_t _child; + int _type; + int _rid; +}; + +# +# Return a struct resource_list. +# +METHOD struct resource_list * get_resource_list { + device_t _dev; + device_t _child; +} DEFAULT bus_generic_get_resource_list; diff --git a/sys/kern/clock_if.m b/sys/kern/clock_if.m new file mode 100644 index 0000000..3ddb25e --- /dev/null +++ b/sys/kern/clock_if.m @@ -0,0 +1,44 @@ +# Copyright (c) 2001 by Thomas Moestl <tmm@FreeBSD.org>. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# $FreeBSD$ + +#include <sys/bus.h> +#include <sys/time.h> + +INTERFACE clock; + +# Interface for clock drivers. This is inspired by the NetBSD device-independent +# clock code (by Gordon W. Ross). + +# An EINVAL error return from this call signifies that the clock has an illegal +# setting. +METHOD int gettime { + device_t dev; + struct timespec *ts; +}; + +METHOD int settime { + device_t dev; + struct timespec *ts; +}; diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m new file mode 100644 index 0000000..005eb38 --- /dev/null +++ b/sys/kern/device_if.m @@ -0,0 +1,127 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/bus.h> + +INTERFACE device; + +# +# Default implementations of some methods. +# +CODE { + static int null_shutdown(device_t dev) + { + return 0; + } + + static int null_suspend(device_t dev) + { + return 0; + } + + static int null_resume(device_t dev) + { + return 0; + } +}; + +# +# Probe to see if the device is present. Return 0 if the device exists, +# ENXIO if it cannot be found. If some other error happens during the +# probe (such as a memory allocation failure), an appropriate error code +# should be returned. For cases where more than one driver matches a +# device, a priority value can be returned. In this case, success codes +# are values less than or equal to zero with the highest value representing +# the best match. Failure codes are represented by positive values and +# the regular unix error codes should be used for the purpose. + +# If a driver returns a success code which is less than zero, it must +# not assume that it will be the same driver which is attached to the +# device. In particular, it must not assume that any values stored in +# the softc structure will be available for its attach method and any +# resources allocated during probe must be released and re-allocated +# if the attach method is called. If a success code of zero is +# returned, the driver can assume that it will be the one attached. +# +# Devices which implement busses should use this method to probe for +# the existence of devices attached to the bus and add them as +# children. If this is combined with the use of bus_generic_attach, +# the child devices will be automatically probed and attached. +# +METHOD int probe { + device_t dev; +}; + +# +# Called by a parent bus to add new devices to the bus. +# +STATICMETHOD void identify { + driver_t *driver; + device_t parent; +}; + +# +# Attach a device to the system. The probe method will have been +# called and will have indicated that the device exists. This routine +# should initialise the hardware and allocate other system resources +# (such as devfs entries). Returns 0 on success. +# +METHOD int attach { + device_t dev; +}; + +# +# Detach a device. This can be called if the user is replacing the +# driver software or if a device is about to be physically removed +# from the system (e.g. for pccard devices). Returns 0 on success. +# +METHOD int detach { + device_t dev; +}; + +# +# This is called during system shutdown to allow the driver to put the +# hardware into a consistent state for rebooting the computer. +# +METHOD int shutdown { + device_t dev; +} DEFAULT null_shutdown; + +# +# This is called by the power-management subsystem when a suspend has been +# requested by the user or by some automatic mechanism. 
This gives +# drivers a chance to veto the suspend or save their configuration before +# power is removed. +# +METHOD int suspend { + device_t dev; +} DEFAULT null_suspend; + +METHOD int resume { + device_t dev; +} DEFAULT null_resume; diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh new file mode 100644 index 0000000..70ad69e --- /dev/null +++ b/sys/kern/genassym.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# $FreeBSD$ + +# Grrr, this should use stdin and stdout, but is encrufted for compatibility. + +usage() +{ + echo "usage: genassym [-o outfile] objfile" + exit 1 +} + +outfile=/dev/stdout +while getopts "o:" option +do + case "$option" in + o) outfile="$OPTARG";; + *) usage;; + esac +done +shift $(($OPTIND - 1)) +case $# in +1) ;; +*) usage;; +esac + +${NM:='nm'} "$1" | ${AWK:='awk'} ' +/ C .*sign$/ { + sign = substr($1, length($1) - 3, 4) + sub("^0*", "", sign) + if (sign != "") + sign = "-" +} +/ C .*w0$/ { + w0 = substr($1, length($1) - 3, 4) +} +/ C .*w1$/ { + w1 = substr($1, length($1) - 3, 4) +} +/ C .*w2$/ { + w2 = substr($1, length($1) - 3, 4) +} +/ C .*w3$/ { + w3 = substr($1, length($1) - 3, 4) + w = w3 w2 w1 w0 + sub("^0*", "", w) + if (w == "") + w = "0" + sub("w3$", "", $3) + # This still has minor problems representing INT_MIN, etc. E.g., + # with 32-bit 2''s complement ints, this prints -0x80000000, which + # has the wrong type (unsigned int). + printf("#define\t%s\t%s0x%s\n", $3, sign, w) +} +' 3>"$outfile" >&3 3>&- diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..41ae8cf --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/user.h> + +#include <machine/md_var.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> + +static int exec_aout_imgact(struct image_params *imgp); + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out", + aout_coredump, + NULL, + MINSIGSTKSZ +}; + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace; + struct vnode *vp; + vm_map_t map; + vm_object_t object; + vm_offset_t text_end, data_end; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + GIANT_REQUIRED; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. + * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + /* Pass PS_STRINGS for BSD/OS binaries only. */ + if (N_GETMID(*a_out) == MID_ZERO) + imgp->ps_strings = PS_STRINGS; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. 
+ */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + mtx_assert(&Giant, MA_OWNED); + if (/* text can't exceed maximum text size */ + a_out->a_text > maxtsiz || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * The vm space can be changed by exec_new_vmspace + */ + vmspace = imgp->proc->p_vmspace; + + vp = imgp->vp; + map = &vmspace->vm_map; + vm_map_lock(map); + VOP_GETVOBJECT(vp, &object); + vm_object_reference(object); + + text_end = virtual_offset + a_out->a_text; + error = vm_map_insert(map, object, + file_offset, + virtual_offset, text_end, + VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + if (error) { + vm_map_unlock(map); + return (error); + } + data_end = text_end + a_out->a_data; + if (a_out->a_data) { + vm_object_reference(object); + error = vm_map_insert(map, object, + file_offset + a_out->a_text, + text_end, data_end, + VM_PROT_ALL, VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + if (error) { + vm_map_unlock(map); + return (error); + } + } + + if (bss_size) { + error = vm_map_insert(map, NULL, 0, + data_end, data_end + bss_size, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_map_unlock(map); + return (error); + } + } + vm_map_unlock(map); + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (virtual_offset + a_out->a_text); + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Dump core, into a file named as described in the comments for + * expand_name(), unless the process was setuid/setgid. 
+ */ +int +aout_coredump(td, vp, limit) + register struct thread *td; + register struct vnode *vp; + off_t limit; +{ + struct proc *p = td->td_proc; + register struct ucred *cred = td->td_ucred; + register struct vmspace *vm = p->p_vmspace; + int error; + + if (ctob((UAREA_PAGES + KSTACK_PAGES) + + vm->vm_dsize + vm->vm_ssize) >= limit) + return (EFAULT); + PROC_LOCK(p); + fill_kinfo_proc(p, &p->p_uarea->u_kproc); + PROC_UNLOCK(p); + error = cpu_coredump(td, vp, cred); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + (int)ctob(vm->vm_dsize), + (off_t)ctob(UAREA_PAGES + KSTACK_PAGES), UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *) NULL, td); + if (error == 0) + error = vn_rdwr_inchunks(UIO_WRITE, vp, + (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), + round_page(ctob(vm->vm_ssize)), + (off_t)ctob(UAREA_PAGES + KSTACK_PAGES) + + ctob(vm->vm_dsize), UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *) NULL, td); + return (error); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +EXEC_SET(aout, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..9044527 --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,1075 @@ +/*- + * Copyright (c) 2000 David O'Brien + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/mman.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/syscall.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> + +#include <machine/elf.h> +#include <machine/md_var.h> + +#define OLD_EI_BRAND 8 + +__ElfType(Brandinfo); +__ElfType(Auxargs); + +static int elf_check_header(const Elf_Ehdr *hdr); +static int elf_freebsd_fixup(register_t **stack_base, + struct image_params *imgp); +static int elf_load_file(struct proc *p, const char *file, u_long *addr, + u_long *entry); +static int elf_load_section(struct proc *p, + struct vmspace *vmspace, struct vnode *vp, + vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, + vm_prot_t prot); +static int exec_elf_imgact(struct image_params *imgp); + +static int elf_trace = 0; +SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); + +struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF", + elf_coredump, + NULL, + MINSIGSTKSZ +}; + +static Elf_Brandinfo freebsd_brand_info = { + ELFOSABI_FREEBSD, + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_brand_inuse(Elf_Brandinfo *entry) +{ + struct proc *p; + int rval = FALSE; + + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + if (p->p_sysent == entry->sysvec) { + rval = TRUE; + break; + } + } + sx_sunlock(&allproc_lock); + + return (rval); +} + +static int +elf_check_header(const Elf_Ehdr *hdr) +{ + if (!IS_ELF(*hdr) || + hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || + hdr->e_ident[EI_DATA] != ELF_TARG_DATA || + hdr->e_ident[EI_VERSION] != EV_CURRENT) + return ENOEXEC; + + if (!ELF_MACHINE_OK(hdr->e_machine)) + return ENOEXEC; + + if (hdr->e_version != ELF_TARG_VER) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t map_len; + vm_offset_t map_addr; + int error, rv; + size_t copy_len; + vm_object_t object; + vm_offset_t file_addr; + vm_offset_t data_buf = 0; + + GIANT_REQUIRED; + + VOP_GETVOBJECT(vp, &object); + error = 0; + + /* + * It's necessary to fail if the filsz + offset taken from the + * header is greater than the actual 
file pager object's size. + * If we were to allow this, then the vm_map_find() below would + * walk right off the end of the file object and into the ether. + * + * While I'm here, might as well check for something else that + * is invalid: filsz cannot be greater than memsz. + */ + if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || + filsz > memsz) { + uprintf("elf_load_section: truncated ELF file\n"); + return (ENOEXEC); + } + + map_addr = trunc_page((vm_offset_t)vmaddr); + file_addr = trunc_page(offset); + + /* + * We have two choices. We can either clear the data in the last page + * of an oversized mapping, or we can start the anon mapping a page + * early and copy the initialized data into that first page. We + * choose the second.. + */ + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - file_addr; + else + map_len = round_page(offset+filsz) - file_addr; + + if (map_len != 0) { + vm_object_reference(object); + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, + object, + file_addr, /* file offset */ + map_addr, /* virtual start */ + map_addr + map_len,/* virtual end */ + prot, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + + /* we can stop now if we've covered it all */ + if (memsz == filsz) { + return 0; + } + } + + + /* + * We have to get the remaining bit of the file into the first part + * of the oversized map segment. This is normally because the .data + * segment in the file is extended to provide bss. It's a neat idea + * to try and save a page, but it's a pain in the behind to implement. + */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page((vm_offset_t)vmaddr + filsz); + map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; + + /* This had damn well better be true! */ + if (map_len != 0) { + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, NULL, 0, + map_addr, map_addr + map_len, + VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + } + + if (copy_len != 0) { + vm_object_reference(object); + rv = vm_map_find(exec_map, + object, + trunc_page(offset + filsz), + &data_buf, + PAGE_SIZE, + TRUE, + VM_PROT_READ, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + + /* send the page fragment to user space */ + error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len); + vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + if (error) { + return (error); + } + } + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + return error; +} + +/* + * Load the file "file" into memory. It may be either a shared object + * or an executable. + * + * The "addr" reference parameter is in/out. On entry, it specifies + * the address where a shared object should be loaded. If the file is + * an executable, this value is ignored. On exit, "addr" specifies + * where the file was actually loaded. + * + * The "entry" reference parameter is out only. On exit, it specifies + * the entry point for the loaded file. 
+ */ +static int +elf_load_file(struct proc *p, const char *file, u_long *addr, u_long *entry) +{ + struct { + struct nameidata nd; + struct vattr attr; + struct image_params image_params; + } *tempdata; + const Elf_Ehdr *hdr = NULL; + const Elf_Phdr *phdr = NULL; + struct nameidata *nd; + struct vmspace *vmspace = p->p_vmspace; + struct vattr *attr; + struct image_params *imgp; + vm_prot_t prot; + u_long rbase; + u_long base_addr = 0; + int error, i, numsegs; + + if (curthread->td_proc != p) + panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */ + + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); + nd = &tempdata->nd; + attr = &tempdata->attr; + imgp = &tempdata->image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = NULL; + imgp->attr = attr; + imgp->firstpage = NULL; + imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE); + + if (imgp->image_header == NULL) { + nd->ni_vp = NULL; + error = ENOMEM; + goto fail; + } + + /* XXXKSE */ + NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); + + if ((error = namei(nd)) != 0) { + nd->ni_vp = NULL; + goto fail; + } + NDFREE(nd, NDF_ONLY_PNBUF); + imgp->vp = nd->ni_vp; + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ + goto fail; + } + + error = exec_map_first_page(imgp); + /* + * Also make certain that the interpreter stays the same, so set + * its VTEXT flag, too. + */ + if (error == 0) + nd->ni_vp->v_flag |= VTEXT; + VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ + if (error) + goto fail; + + hdr = (const Elf_Ehdr *)imgp->image_header; + if ((error = elf_check_header(hdr)) != 0) + goto fail; + if (hdr->e_type == ET_DYN) + rbase = *addr; + else if (hdr->e_type == ET_EXEC) + rbase = 0; + else { + error = ENOEXEC; + goto fail; + } + + /* Only support headers that fit within first page for now */ + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + + phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); + + for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { + if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if ((error = elf_load_section(p, vmspace, nd->ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + rbase, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) != 0) + goto fail; + /* + * Establish the base address if this is the + * first segment. 
+ */ + if (numsegs == 0) + base_addr = trunc_page(phdr[i].p_vaddr + rbase); + numsegs++; + } + } + *addr = base_addr; + *entry=(unsigned long)hdr->e_entry + rbase; + +fail: + if (imgp->firstpage) + exec_unmap_first_page(imgp); + if (imgp->image_header) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header, + PAGE_SIZE); + if (nd->ni_vp) + vrele(nd->ni_vp); + + free(tempdata, M_TEMP); + + return error; +} + +/* + * non static, as it can be overridden by start_init() + */ +#ifdef __ia64__ +int fallback_elf_brand = ELFOSABI_FREEBSD; +#else +int fallback_elf_brand = -1; +#endif +SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW, + &fallback_elf_brand, -1, + "ELF brand of last resort"); + +static int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header; + const Elf_Phdr *phdr; + Elf_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace; + vm_prot_t prot; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i; + const char *interp = NULL; + Elf_Brandinfo *brand_info; + char *path; + + GIANT_REQUIRED; + + /* + * Do we have a valid ELF header ? + */ + if (elf_check_header(hdr) != 0 || hdr->e_type != ET_EXEC) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + /* Only support headers in first page for now */ + return ENOEXEC; + } + phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff); + + /* + * From this point on, we may have resources that need to be freed. + */ + + /* + * Yeah, I'm paranoid. There is every reason in the world to get + * VTEXT now since from here on out, there are places we can have + * a context switch. Better safe than sorry; I really don't want + * the file to change while it's being loaded. + */ + mtx_lock(&imgp->vp->v_interlock); + imgp->vp->v_flag |= VTEXT; + mtx_unlock(&imgp->vp->v_interlock); + + if ((error = exec_extract_strings(imgp)) != 0) + goto fail; + + exec_new_vmspace(imgp); + + vmspace = imgp->proc->p_vmspace; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_LOAD: /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if ((error = elf_load_section(imgp->proc, + vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) != 0) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + } + break; + case PT_INTERP: /* Path to interpreter */ + if (phdr[i].p_filesz > MAXPATHLEN || + phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + interp = imgp->image_header + phdr[i].p_offset; + break; + case PT_PHDR: /* Program header table info */ + proghdr = phdr[i].p_vaddr; + break; + default: + break; + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; + + addr = ELF_RTLD_ADDR(vmspace); + + imgp->entry_addr = entry; + + brand_info = NULL; + + /* We support three types of branding -- (1) the ELF EI_OSABI field + * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string + * branding w/in the ELF header, and (3) path of the `interp_path' + * field. We should also look for an ".note.ABI-tag" ELF section now + * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones. + */ + + /* If the executable has a brand, search for it in the brand list. */ + if (brand_info == NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + (hdr->e_ident[EI_OSABI] == bi->brand + || 0 == + strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], + bi->compat_3_brand, strlen(bi->compat_3_brand)))) { + brand_info = bi; + break; + } + } + } + + /* Lacking a known brand, search for a recognized interpreter. 
*/ + if (brand_info == NULL && interp != NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + strcmp(interp, bi->interp_path) == 0) { + brand_info = bi; + break; + } + } + } + + /* Lacking a recognized interpreter, try the default brand */ + if (brand_info == NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && fallback_elf_brand == bi->brand) { + brand_info = bi; + break; + } + } + } + + if (brand_info == NULL) { + uprintf("ELF binary type \"%u\" not known.\n", + hdr->e_ident[EI_OSABI]); + error = ENOEXEC; + goto fail; + } + + imgp->proc->p_sysent = brand_info->sysvec; + if (interp != NULL) { + path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + snprintf(path, MAXPATHLEN, "%s%s", + brand_info->emul_path, interp); + if ((error = elf_load_file(imgp->proc, path, &addr, + &imgp->entry_addr)) != 0) { + if ((error = elf_load_file(imgp->proc, interp, &addr, + &imgp->entry_addr)) != 0) { + uprintf("ELF interpreter %s not found\n", path); + free(path, M_TEMP); + goto fail; + } + } + free(path, M_TEMP); + } + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + +fail: + return error; +} + +static int +elf_freebsd_fixup(register_t **stack_base, struct image_params *imgp) +{ + Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; + register_t *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + suword(*stack_base, (long) imgp->argc); + return 0; +} + +/* + * Code for generating ELF core dumps. + */ + +typedef void (*segment_callback)(vm_map_entry_t, void *); + +/* Closure for cb_put_phdr(). */ +struct phdr_closure { + Elf_Phdr *phdr; /* Program header to fill in */ + Elf_Off offset; /* Offset of segment in core file */ +}; + +/* Closure for cb_size_segment(). */ +struct sseg_closure { + int count; /* Count of writable segments. */ + size_t size; /* Total size of all writable segments. 
*/ +}; + +static void cb_put_phdr(vm_map_entry_t, void *); +static void cb_size_segment(vm_map_entry_t, void *); +static void each_writable_segment(struct proc *, segment_callback, void *); +static int elf_corehdr(struct thread *, struct vnode *, struct ucred *, + int, void *, size_t); +static void elf_puthdr(struct proc *, void *, size_t *, + const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int); +static void elf_putnote(void *, size_t *, const char *, int, + const void *, size_t); + +extern int osreldate; + +int +elf_coredump(td, vp, limit) + struct thread *td; + register struct vnode *vp; + off_t limit; +{ + register struct proc *p = td->td_proc; + register struct ucred *cred = td->td_ucred; + int error = 0; + struct sseg_closure seginfo; + void *hdr; + size_t hdrsize; + + /* Size the program segments. */ + seginfo.count = 0; + seginfo.size = 0; + each_writable_segment(p, cb_size_segment, &seginfo); + + /* + * Calculate the size of the core file header area by making + * a dry run of generating it. Nothing is written, but the + * size is calculated. + */ + hdrsize = 0; + elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize, + (const prstatus_t *)NULL, (const prfpregset_t *)NULL, + (const prpsinfo_t *)NULL, seginfo.count); + + if (hdrsize + seginfo.size >= limit) + return (EFAULT); + + /* + * Allocate memory for building the header, fill it up, + * and write it out. + */ + hdr = malloc(hdrsize, M_TEMP, M_WAITOK); + if (hdr == NULL) { + return EINVAL; + } + error = elf_corehdr(td, vp, cred, seginfo.count, hdr, hdrsize); + + /* Write the contents of all of the writable segments. */ + if (error == 0) { + Elf_Phdr *php; + off_t offset; + int i; + + php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; + offset = hdrsize; + for (i = 0; i < seginfo.count; i++) { + error = vn_rdwr_inchunks(UIO_WRITE, vp, + (caddr_t)php->p_vaddr, + php->p_filesz, offset, UIO_USERSPACE, + IO_UNIT | IO_DIRECT, cred, (int *)NULL, curthread); /* XXXKSE */ + if (error != 0) + break; + offset += php->p_filesz; + php++; + } + } + free(hdr, M_TEMP); + + return error; +} + +/* + * A callback for each_writable_segment() to write out the segment's + * program header entry. + */ +static void +cb_put_phdr(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct phdr_closure *phc = (struct phdr_closure *)closure; + Elf_Phdr *phdr = phc->phdr; + + phc->offset = round_page(phc->offset); + + phdr->p_type = PT_LOAD; + phdr->p_offset = phc->offset; + phdr->p_vaddr = entry->start; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; + phdr->p_align = PAGE_SIZE; + phdr->p_flags = 0; + if (entry->protection & VM_PROT_READ) + phdr->p_flags |= PF_R; + if (entry->protection & VM_PROT_WRITE) + phdr->p_flags |= PF_W; + if (entry->protection & VM_PROT_EXECUTE) + phdr->p_flags |= PF_X; + + phc->offset += phdr->p_filesz; + phc->phdr++; +} + +/* + * A callback for each_writable_segment() to gather information about + * the number of segments and their total size. + */ +static void +cb_size_segment(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct sseg_closure *ssc = (struct sseg_closure *)closure; + + ssc->count++; + ssc->size += entry->end - entry->start; +} + +/* + * For each writable segment in the process's memory map, call the given + * function with a pointer to the map entry and some arbitrary + * caller-supplied data. 
+ */ +static void +each_writable_segment(p, func, closure) + struct proc *p; + segment_callback func; + void *closure; +{ + vm_map_t map = &p->p_vmspace->vm_map; + vm_map_entry_t entry; + + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + vm_object_t obj; + + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || + (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) != + (VM_PROT_READ|VM_PROT_WRITE)) + continue; + + /* + ** Dont include memory segment in the coredump if + ** MAP_NOCORE is set in mmap(2) or MADV_NOCORE in + ** madvise(2). + */ + if (entry->eflags & MAP_ENTRY_NOCOREDUMP) + continue; + + if ((obj = entry->object.vm_object) == NULL) + continue; + + /* Find the deepest backing object. */ + while (obj->backing_object != NULL) + obj = obj->backing_object; + + /* Ignore memory-mapped devices and such things. */ + if (obj->type != OBJT_DEFAULT && + obj->type != OBJT_SWAP && + obj->type != OBJT_VNODE) + continue; + + (*func)(entry, closure); + } +} + +/* + * Write the core file header to the file, including padding up to + * the page boundary. + */ +static int +elf_corehdr(td, vp, cred, numsegs, hdr, hdrsize) + struct thread *td; + struct vnode *vp; + struct ucred *cred; + int numsegs; + size_t hdrsize; + void *hdr; +{ + struct { + prstatus_t status; + prfpregset_t fpregset; + prpsinfo_t psinfo; + } *tempdata; + struct proc *p = td->td_proc; + size_t off; + prstatus_t *status; + prfpregset_t *fpregset; + prpsinfo_t *psinfo; + + tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK); + status = &tempdata->status; + fpregset = &tempdata->fpregset; + psinfo = &tempdata->psinfo; + + /* Gather the information for the header. */ + status->pr_version = PRSTATUS_VERSION; + status->pr_statussz = sizeof(prstatus_t); + status->pr_gregsetsz = sizeof(gregset_t); + status->pr_fpregsetsz = sizeof(fpregset_t); + status->pr_osreldate = osreldate; + status->pr_cursig = p->p_sig; + status->pr_pid = p->p_pid; + fill_regs(td, &status->pr_reg); + + fill_fpregs(td, fpregset); + + psinfo->pr_version = PRPSINFO_VERSION; + psinfo->pr_psinfosz = sizeof(prpsinfo_t); + strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1); + + /* XXX - We don't fill in the command line arguments properly yet. */ + strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ); + + /* Fill in the header. */ + bzero(hdr, hdrsize); + off = 0; + elf_puthdr(p, hdr, &off, status, fpregset, psinfo, numsegs); + + free(tempdata, M_TEMP); + + /* Write it to the core file. */ + return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, + UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NULL, td); /* XXXKSE */ +} + +static void +elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status, + const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs) +{ + size_t ehoff; + size_t phoff; + size_t noteoff; + size_t notesz; + + ehoff = *off; + *off += sizeof(Elf_Ehdr); + + phoff = *off; + *off += (numsegs + 1) * sizeof(Elf_Phdr); + + noteoff = *off; + elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status, + sizeof *status); + elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset, + sizeof *fpregset); + elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, + sizeof *psinfo); + notesz = *off - noteoff; + + /* Align up to a page boundary for the program segments. */ + *off = round_page(*off); + + if (dst != NULL) { + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + struct phdr_closure phc; + + /* + * Fill in the ELF header. 
+ */ + ehdr = (Elf_Ehdr *)((char *)dst + ehoff); + ehdr->e_ident[EI_MAG0] = ELFMAG0; + ehdr->e_ident[EI_MAG1] = ELFMAG1; + ehdr->e_ident[EI_MAG2] = ELFMAG2; + ehdr->e_ident[EI_MAG3] = ELFMAG3; + ehdr->e_ident[EI_CLASS] = ELF_CLASS; + ehdr->e_ident[EI_DATA] = ELF_DATA; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; + ehdr->e_ident[EI_ABIVERSION] = 0; + ehdr->e_ident[EI_PAD] = 0; + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = phoff; + ehdr->e_flags = 0; + ehdr->e_ehsize = sizeof(Elf_Ehdr); + ehdr->e_phentsize = sizeof(Elf_Phdr); + ehdr->e_phnum = numsegs + 1; + ehdr->e_shentsize = sizeof(Elf_Shdr); + ehdr->e_shnum = 0; + ehdr->e_shstrndx = SHN_UNDEF; + + /* + * Fill in the program header entries. + */ + phdr = (Elf_Phdr *)((char *)dst + phoff); + + /* The note segement. */ + phdr->p_type = PT_NOTE; + phdr->p_offset = noteoff; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = notesz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + phdr++; + + /* All the writable segments from the program. */ + phc.phdr = phdr; + phc.offset = *off; + each_writable_segment(p, cb_put_phdr, &phc); + } +} + +static void +elf_putnote(void *dst, size_t *off, const char *name, int type, + const void *desc, size_t descsz) +{ + Elf_Note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = descsz; + note.n_type = type; + if (dst != NULL) + bcopy(¬e, (char *)dst + *off, sizeof note); + *off += sizeof note; + if (dst != NULL) + bcopy(name, (char *)dst + *off, note.n_namesz); + *off += roundup2(note.n_namesz, sizeof(Elf_Size)); + if (dst != NULL) + bcopy(desc, (char *)dst + *off, note.n_descsz); + *off += roundup2(note.n_descsz, sizeof(Elf_Size)); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +EXEC_SET(elf, elf_execsw); diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..57a5c1d --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,385 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + * This module handles execution of a.out files which have been run through + * "gzip". This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... 
+ * tidy up unnecesary includes + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int gotheader; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact(struct image_params *imgp); +static int NextByte(void *vp); +static int do_aout_hdr(struct imgact_gzip *); +static int Flush(void *vp, u_char *, u_long siz); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + /* + * The unzipped file may not even have been long enough to contain + * a header giving Flush() a chance to return error. Check for this. + */ + if ( !igz.gotheader ) + return ENOEXEC; + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + mtx_assert(&Giant, MA_OWNED); + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > maxtsiz || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmspace = gz->ip->proc->p_vmspace; + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, + VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. 
+ */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (gz->virtual_offset + gz->a_out.a_text); + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + gz->gotheader = 1; + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset == 0) { + q = (u_char *) (uintptr_t) gz->virtual_offset; + copyout(&gz->a_out, q, sizeof gz->a_out); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && + gz->output + siz > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) (uintptr_t) + (gz->virtual_offset + gz->output - gz->file_offset); + copyout(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + */ +static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +EXEC_SET(execgzip, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..8480fcc --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? */ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n' && *ihp != '#'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENAMETOOLONG); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. + */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + imgp->argv0 = imgp->uap->fname; + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. 
+ */ +static struct execsw shell_execsw = { exec_shell_imgact, "#!" }; +EXEC_SET(shell, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..2a16ba2 --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1078 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef _KERNEL +#include <sys/systm.h> +#include <sys/kernel.h> +#endif +#include <sys/malloc.h> + +#ifdef _KERNEL +static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees"); +#endif + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef _KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef _KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef _KERNEL /* want to use this file in kzip also */ +extern unsigned char *kzipmalloc (int); +extern void kzipfree (void*); +#define malloc(x, y, z) kzipmalloc((x)) +#define free(x, y) kzipfree((x)) +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. + + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. 
Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. The decoding process is basically: get a literal/length + code; if EOB then done; if a literal, emit the decoded byte; if a + length then get the distance and emit the referred-to bytes from the + sliding window of previously emitted data. + + There are (currently) three kinds of inflate blocks: stored, fixed, and + dynamic. The compressor outputs a chunk of data at a time and decides + which method to use on a chunk-by-chunk basis. A chunk might typically + be 32K to 64K, uncompressed. 
If the chunk is uncompressible, then the + "stored" method is used. In this case, the bytes are simply stored as + is, eight bits per byte, with none of the above coding. The bytes are + preceded by a count, since there is no longer an EOB code. + + If the data is compressible, then either the fixed or dynamic methods + are used. In the dynamic method, the compressed data is preceded by + an encoding of the literal/length and distance Huffman codes that are + to be used to decode this block. The representation is itself Huffman + coded, and so is preceded by a description of that code. These code + descriptions take up a little space, and so for small blocks, there is + a predefined set of codes, called the fixed codes. The fixed method is + used if the block ends up smaller that way (usually for quite small + chunks); otherwise the dynamic method is used. In the latter case, the + codes are customized to the probabilities in the current block and so + can code it much better than the pre-determined fixed codes can. + + The Huffman codes themselves are decoded using a mutli-level table + lookup, in order to maximize the speed of decoding plus the speed of + building the decoding tables. See the comments below that precede the + lbits and dbits tuning parameters. + */ + + +/* + Notes beyond the 1.93a appnote.txt: + + 1. Distance pointers never point before the beginning of the output + stream. + 2. Distance pointers can point back across blocks, up to 32k away. + 3. There is an implied maximum of 7 bits for the bit length table and + 15 bits for the actual data. + 4. If only one code exists, then it is encoded using one bit. (Zero + would be more efficient, but perhaps a little confusing.) If two + codes exist, they are coded using one bit each (0 and 1). + 5. There is no way of sending zero distance codes--a dummy must be + sent if there are none. (History: a pre 2.0 version of PKZIP would + store blocks with no distance codes, but this was discovered to be + too harsh a criterion.) Valid only for 1.93a. 2.04c does allow + zero distance codes, which is sent as one code of zero bits in + length. + 6. There are up to 286 literal/length codes. Code 256 represents the + end-of-block. Note however that the static length tree defines + 288 codes just to fill out the Huffman codes. Codes 286 and 287 + cannot be used though, since there is no length base or extra bits + defined for them. Similarily, there are up to 30 distance codes. + However, static trees define 32 codes (all 5 bits) to fill out the + Huffman codes, but the last two had better not show up in the data. + 7. Unzip can check dynamic Huffman blocks for complete code sets. + The exception is that a single code would not be complete (see #4). + 8. The five bits following the block type is really the number of + literal codes sent minus 257. + 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits + (1+6+6). Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. 
The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build(struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *); +static int huft_free(struct inflate *, struct huft *); +static int inflate_codes(struct inflate *, struct huft *, struct huft *, int, int); +static int inflate_stored(struct inflate *); +static int xinflate(struct inflate *); +static int inflate_fixed(struct inflate *); +static int inflate_dynamic(struct inflate *); +static int inflate_block(struct inflate *, int *); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. */ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. 
*/ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. 
The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. */ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef _KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g - w) > (unsigned) *m ? 
*m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. 
*/ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. */ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + 
return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? */ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef _KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef _KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef _KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 0000000..d5c5656 --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,669 @@ +/* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)init_main.c 8.9 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_init_path.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/vnode.h> +#include <sys/sysent.h> +#include <sys/reboot.h> +#include <sys/sx.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> +#include <sys/unistd.h> +#include <sys/malloc.h> +#include <sys/conf.h> + +#include <machine/cpu.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <sys/copyright.h> + +void mi_startup(void); /* Should be elsewhere */ + +/* Components of the first process -- never freed. */ +static struct session session0; +static struct pgrp pgrp0; +struct proc proc0; +struct thread thread0; +static struct procsig procsig0; +static struct filedesc0 filedesc0; +static struct plimit limit0; +static struct vmspace vmspace0; +struct proc *initproc; + +int cmask = CMASK; +extern int fallback_elf_brand; + +struct vnode *rootvp; +int boothowto = 0; /* initialized so that it can be patched */ +SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); +int bootverbose; +SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); + +/* + * This ensures that there is at least one entry so that the sysinit_set + * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never + * executed. + */ +SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) + +/* + * The sysinit table itself. Items are checked off as the are run. + * If we want to register new sysinit types, add them to newsysinit. + */ +SET_DECLARE(sysinit_set, struct sysinit); +struct sysinit **sysinit, **sysinit_end; +struct sysinit **newsysinit, **newsysinit_end; + +/* + * Merge a new sysinit set into the current set, reallocating it if + * necessary. This can only be called after malloc is running. 
+ */ +void +sysinit_add(struct sysinit **set, struct sysinit **set_end) +{ + struct sysinit **newset; + struct sysinit **sipp; + struct sysinit **xipp; + int count; + + count = set_end - set; + if (newsysinit) + count += newsysinit_end - newsysinit; + else + count += sysinit_end - sysinit; + newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); + if (newset == NULL) + panic("cannot malloc for sysinit"); + xipp = newset; + if (newsysinit) + for (sipp = newsysinit; sipp < newsysinit_end; sipp++) + *xipp++ = *sipp; + else + for (sipp = sysinit; sipp < sysinit_end; sipp++) + *xipp++ = *sipp; + for (sipp = set; sipp < set_end; sipp++) + *xipp++ = *sipp; + if (newsysinit) + free(newsysinit, M_TEMP); + newsysinit = newset; + newsysinit_end = newset + count; +} + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads". + */ +void +mi_startup(void) +{ + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + + if (sysinit == NULL) { + sysinit = SET_BEGIN(sysinit_set); + sysinit_end = SET_LIMIT(sysinit_set); + } + +restart: + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + */ + for (sipp = sysinit; sipp < sysinit_end; sipp++) { + for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. + */ + for (sipp = sysinit; sipp < sysinit_end; sipp++) { + + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + if ((*sipp)->subsystem == SI_SUB_DONE) + continue; + + /* Call function */ + (*((*sipp)->func))((*sipp)->udata); + + /* Check off the one we're just done */ + (*sipp)->subsystem = SI_SUB_DONE; + + /* Check if we've installed more sysinit items via KLD */ + if (newsysinit != NULL) { + if (sysinit != SET_BEGIN(sysinit_set)) + free(sysinit, M_TEMP); + sysinit = newsysinit; + sysinit_end = newsysinit_end; + newsysinit = NULL; + newsysinit_end = NULL; + goto restart; + } + } + + panic("Shouldn't get here!"); + /* NOTREACHED*/ +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. 
+ **** + *************************************************************************** + */ +static void +print_caddr_t(void *data __unused) +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) +SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version) + +static void +set_boot_verbose(void *data __unused) +{ + + if (boothowto & RB_VERBOSE) + bootverbose++; +} +SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void +proc0_init(void *dummy __unused) +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + struct thread *td; + struct ksegrp *kg; + struct kse *ke; + + GIANT_REQUIRED; + p = &proc0; + td = &thread0; + + /* + * Initialize magic number. + */ + p->p_magic = P_MAGIC; + + /* + * Initialize thread, process and pgrp structures. + */ + procinit(); + + /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* + * additional VM structures + */ + vm_init2(); + + /* + * Create process 0 (the swapper). + */ + LIST_INSERT_HEAD(&allproc, p, p_list); + LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); + mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); + p->p_pgrp = &pgrp0; + LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); + LIST_INIT(&pgrp0.pg_members); + LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); + + pgrp0.pg_session = &session0; + mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); + session0.s_count = 1; + session0.s_leader = p; + +#ifdef __ELF__ + p->p_sysent = &elf_freebsd_sysvec; +#else + p->p_sysent = &aout_sysvec; +#endif + + ke = &proc0.p_kse; /* XXXKSE */ + kg = &proc0.p_ksegrp; /* XXXKSE */ + p->p_flag = P_SYSTEM; + p->p_sflag = PS_INMEM; + p->p_stat = SRUN; + p->p_ksegrp.kg_nice = NZERO; + kg->kg_pri_class = PRI_TIMESHARE; + kg->kg_user_pri = PUSER; + td->td_priority = PVM; + td->td_base_pri = PUSER; + + p->p_peers = 0; + p->p_leader = p; + + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + callout_init(&p->p_itcallout, 0); + callout_init(&td->td_slpcallout, 1); + + /* Create credentials. */ + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + p->p_ucred->cr_uidinfo = uifind(0); + p->p_ucred->cr_ruidinfo = uifind(0); + p->p_ucred->cr_prison = NULL; /* Don't jail it. */ + td->td_ucred = crhold(p->p_ucred); + + /* Create procsig. */ + p->p_procsig = &procsig0; + p->p_procsig->ps_refcnt = 1; + + /* Initialize signal state for process 0. */ + siginit(&proc0); + + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. 
*/ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_cpulimit = RLIM_INFINITY; + limit0.p_refcnt = 1; + + /* Allocate a prototype map so we have something to fork. */ + pmap_pinit0(vmspace_pmap(&vmspace0)); + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS)); + vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_uarea->u_stats; + p->p_sigacts = &p->p_uarea->u_sigacts; + + /* + * Charge root for one process. + */ + (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) + +/* ARGSUSED*/ +static void +proc0_post(void *dummy __unused) +{ + struct timespec ts; + struct proc *p; + + /* + * Now we can look at the time, having had a chance to verify the + * time from the filesystem. Pretend that proc0 started now. + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + microtime(&p->p_stats->p_start); + p->p_runtime.sec = 0; + p->p_runtime.frac = 0; + } + sx_sunlock(&allproc_lock); + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + /* + * Give the ``random'' number generator a thump. + */ + nanotime(&ts); + srandom(ts.tv_sec ^ ts.tv_nsec); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) + +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ + + +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. + **** + *************************************************************************** + */ + +/* + * List of paths to try when searching for "init". + */ +static char init_path[MAXPATHLEN] = +#ifdef INIT_PATH + __XSTRING(INIT_PATH); +#else + "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; +#endif +SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, + "Path used to search the init process"); + +/* + * Start the initial user process; try exec'ing each pathname in init_path. + * The program is invoked with one argument containing the boot flags. + */ +static void +start_init(void *dummy) +{ + vm_offset_t addr; + struct execve_args args; + int options, error; + char *var, *path, *next, *s; + char *ucp, **uap, *arg0, *arg1; + struct thread *td; + struct proc *p; + int init_does_devfs = 0; + + mtx_lock(&Giant); + + GIANT_REQUIRED; + + td = curthread; + p = td->td_proc; + + vfs_mountroot(NULL); + + /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ + if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) + panic("cannot find root vnode"); + FILEDESC_LOCK(p->p_fd); + p->p_fd->fd_cdir = rootvnode; + VREF(p->p_fd->fd_cdir); + p->p_fd->fd_rdir = rootvnode; + VREF(p->p_fd->fd_rdir); + FILEDESC_UNLOCK(p->p_fd); + VOP_UNLOCK(rootvnode, 0, td); + + if (devfs_present) { + /* + * For disk based systems, we probably cannot do this yet + * since the fs will be read-only. But a NFS root + * might be ok. It is worth a shot. + */ + error = vn_mkdir("/dev", 0700, UIO_SYSSPACE, td); + if (error == EEXIST) + error = 0; + if (error == 0) + error = kernel_vmount(0, "fstype", "devfs", + "fspath", "/dev", NULL); + if (error != 0) + init_does_devfs = 1; + } + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(USRSTACK - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; + + if ((var = getenv("init_path")) != NULL) { + strncpy(init_path, var, sizeof init_path); + init_path[sizeof init_path - 1] = 0; + freeenv(var); + } + if ((var = getenv("kern.fallback_elf_brand")) != NULL) { + fallback_elf_brand = strtol(var, NULL, 0); + freeenv(var); + } + + for (path = init_path; *path != '\0'; path = next) { + while (*path == ':') + path++; + if (*path == '\0') + break; + for (next = path; *next != '\0' && *next != ':'; next++) + /* nothing */ ; + if (bootverbose) + printf("start_init: trying %.*s\n", (int)(next - path), + path); + + /* + * Move out the boot flag argument. + */ + options = 0; + ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ + if (boothowto & RB_SINGLE) { + (void)subyte(--ucp, 's'); + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + (void)subyte(--ucp, 'f'); + options = 1; + } +#endif + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + if (init_does_devfs) { + (void)subyte(--ucp, 'd'); + options = 1; + } + + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; + + /* + * Move out the file name (also arg 0). + */ + (void)subyte(--ucp, 0); + for (s = next - 1; s >= path; s--) + (void)subyte(--ucp, *s); + arg0 = ucp; + + /* + * Move out the arg pointers. + */ + uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); + (void)suword((caddr_t)--uap, (long)0); /* terminator */ + (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); + (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); + + /* + * Point at the arguments. + */ + args.fname = arg0; + args.argv = uap; + args.envv = NULL; + + /* + * Now try to exec the program. If can't for any reason + * other than it doesn't exist, complain. + * + * Otherwise, return via fork_trampoline() all the way + * to user mode as init! + */ + if ((error = execve(td, &args)) == 0) { + mtx_unlock(&Giant); + return; + } + if (error != ENOENT) + printf("exec %.*s: error %d\n", (int)(next - path), + path, error); + } + printf("init: not found in path %s\n", init_path); + panic("no init"); +} + +/* + * Like kthread_create(), but runs in it's own address space. + * We do this early to reserve pid 1. + * + * Note special case - do not make it runnable yet. Other work + * in progress will change this more. 
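+ * fork1() is therefore called with RFSTOPPED; the new process stays
+ * parked until kick_init() below marks it runnable at
+ * SI_SUB_KTHREAD_INIT.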
+ */ +static void +create_init(const void *udata __unused) +{ + struct ucred *newcred, *oldcred; + int error; + + error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, &initproc); + if (error) + panic("cannot fork init: %d\n", error); + /* divorce init's credentials from the kernel's */ + newcred = crget(); + PROC_LOCK(initproc); + initproc->p_flag |= P_SYSTEM; + oldcred = initproc->p_ucred; + crcopy(newcred, oldcred); + initproc->p_ucred = newcred; + PROC_UNLOCK(initproc); + crfree(oldcred); + mtx_lock_spin(&sched_lock); + initproc->p_sflag |= PS_INMEM; + mtx_unlock_spin(&sched_lock); + cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); +} +SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) + +/* + * Make it runnable now. + */ +static void +kick_init(const void *udata __unused) +{ + struct thread *td; + + td = FIRST_THREAD_IN_PROC(initproc); + mtx_lock_spin(&sched_lock); + initproc->p_stat = SRUN; + setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */ + mtx_unlock_spin(&sched_lock); +} +SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 0000000..425e3b7 --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,418 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +#define AS(name) (sizeof(struct name) / sizeof(register_t)) + +#ifdef COMPAT_43 +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) +#else +#define compat(n, name) 0, (sy_call_t *)nosys +#endif + +/* The casts are bogus but will do for now. 
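+ * Each entry pairs the argument size in register_t words (optionally
+ * OR'ed with SYF_MPSAFE) with the handler function; unimplemented and
+ * obsolete slots fall through to nosys().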
*/ +struct sysent sysent[] = { + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { SYF_MPSAFE | AS(sys_exit_args), (sy_call_t *)sys_exit }, /* 1 = exit */ + { SYF_MPSAFE | 0, (sy_call_t *)fork }, /* 2 = fork */ + { SYF_MPSAFE | AS(read_args), (sy_call_t *)read }, /* 3 = read */ + { SYF_MPSAFE | AS(write_args), (sy_call_t *)write }, /* 4 = write */ + { AS(open_args), (sy_call_t *)open }, /* 5 = open */ + { SYF_MPSAFE | AS(close_args), (sy_call_t *)close }, /* 6 = close */ + { SYF_MPSAFE | AS(wait_args), (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(AS(ocreat_args),creat) }, /* 8 = old creat */ + { AS(link_args), (sy_call_t *)link }, /* 9 = link */ + { AS(unlink_args), (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { AS(chdir_args), (sy_call_t *)chdir }, /* 12 = chdir */ + { AS(fchdir_args), (sy_call_t *)fchdir }, /* 13 = fchdir */ + { AS(mknod_args), (sy_call_t *)mknod }, /* 14 = mknod */ + { AS(chmod_args), (sy_call_t *)chmod }, /* 15 = chmod */ + { AS(chown_args), (sy_call_t *)chown }, /* 16 = chown */ + { SYF_MPSAFE | AS(obreak_args), (sy_call_t *)obreak }, /* 17 = break */ + { AS(getfsstat_args), (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(AS(olseek_args),lseek) }, /* 19 = old lseek */ + { SYF_MPSAFE | 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { AS(mount_args), (sy_call_t *)mount }, /* 21 = mount */ + { AS(unmount_args), (sy_call_t *)unmount }, /* 22 = unmount */ + { SYF_MPSAFE | AS(setuid_args), (sy_call_t *)setuid }, /* 23 = setuid */ + { SYF_MPSAFE | 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { SYF_MPSAFE | 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { AS(ptrace_args), (sy_call_t *)ptrace }, /* 26 = ptrace */ + { SYF_MPSAFE | AS(recvmsg_args), (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { SYF_MPSAFE | AS(sendmsg_args), (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { SYF_MPSAFE | AS(recvfrom_args), (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { SYF_MPSAFE | AS(accept_args), (sy_call_t *)accept }, /* 30 = accept */ + { SYF_MPSAFE | AS(getpeername_args), (sy_call_t *)getpeername }, /* 31 = getpeername */ + { SYF_MPSAFE | AS(getsockname_args), (sy_call_t *)getsockname }, /* 32 = getsockname */ + { AS(access_args), (sy_call_t *)access }, /* 33 = access */ + { AS(chflags_args), (sy_call_t *)chflags }, /* 34 = chflags */ + { AS(fchflags_args), (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { SYF_MPSAFE | AS(kill_args), (sy_call_t *)kill }, /* 37 = kill */ + { compat(AS(ostat_args),stat) }, /* 38 = old stat */ + { SYF_MPSAFE | 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(AS(olstat_args),lstat) }, /* 40 = old lstat */ + { AS(dup_args), (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { SYF_MPSAFE | 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { SYF_MPSAFE | AS(profil_args), (sy_call_t *)profil }, /* 44 = profil */ + { AS(ktrace_args), (sy_call_t *)ktrace }, /* 45 = ktrace */ + { compat(SYF_MPSAFE | AS(osigaction_args),sigaction) }, /* 46 = old sigaction */ + { SYF_MPSAFE | 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { compat(SYF_MPSAFE | AS(osigprocmask_args),sigprocmask) }, /* 48 = old sigprocmask */ + { SYF_MPSAFE | AS(getlogin_args), (sy_call_t *)getlogin }, /* 49 = getlogin */ + { SYF_MPSAFE | AS(setlogin_args), (sy_call_t *)setlogin }, /* 50 = setlogin */ + { SYF_MPSAFE | AS(acct_args), (sy_call_t *)acct }, /* 51 = acct */ + { compat(SYF_MPSAFE | 0,sigpending) }, /* 52 = old sigpending */ + { SYF_MPSAFE | 
AS(sigaltstack_args), (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { SYF_MPSAFE | AS(ioctl_args), (sy_call_t *)ioctl }, /* 54 = ioctl */ + { SYF_MPSAFE | AS(reboot_args), (sy_call_t *)reboot }, /* 55 = reboot */ + { AS(revoke_args), (sy_call_t *)revoke }, /* 56 = revoke */ + { AS(symlink_args), (sy_call_t *)symlink }, /* 57 = symlink */ + { AS(readlink_args), (sy_call_t *)readlink }, /* 58 = readlink */ + { SYF_MPSAFE | AS(execve_args), (sy_call_t *)execve }, /* 59 = execve */ + { SYF_MPSAFE | AS(umask_args), (sy_call_t *)umask }, /* 60 = umask */ + { AS(chroot_args), (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(SYF_MPSAFE | AS(ofstat_args),fstat) }, /* 62 = old fstat */ + { compat(SYF_MPSAFE | AS(getkerninfo_args),getkerninfo) }, /* 63 = old getkerninfo */ + { compat(SYF_MPSAFE | 0,getpagesize) }, /* 64 = old getpagesize */ + { AS(msync_args), (sy_call_t *)msync }, /* 65 = msync */ + { SYF_MPSAFE | 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { SYF_MPSAFE | AS(sbrk_args), (sy_call_t *)sbrk }, /* 69 = sbrk */ + { SYF_MPSAFE | AS(sstk_args), (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(SYF_MPSAFE | AS(ommap_args),mmap) }, /* 71 = old mmap */ + { SYF_MPSAFE | AS(ovadvise_args), (sy_call_t *)ovadvise }, /* 72 = vadvise */ + { SYF_MPSAFE | AS(munmap_args), (sy_call_t *)munmap }, /* 73 = munmap */ + { SYF_MPSAFE | AS(mprotect_args), (sy_call_t *)mprotect }, /* 74 = mprotect */ + { SYF_MPSAFE | AS(madvise_args), (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { SYF_MPSAFE | AS(mincore_args), (sy_call_t *)mincore }, /* 78 = mincore */ + { SYF_MPSAFE | AS(getgroups_args), (sy_call_t *)getgroups }, /* 79 = getgroups */ + { SYF_MPSAFE | AS(setgroups_args), (sy_call_t *)setgroups }, /* 80 = setgroups */ + { SYF_MPSAFE | 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { SYF_MPSAFE | AS(setpgid_args), (sy_call_t *)setpgid }, /* 82 = setpgid */ + { SYF_MPSAFE | AS(setitimer_args), (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(SYF_MPSAFE | 0,wait) }, /* 84 = old wait */ + { SYF_MPSAFE | AS(swapon_args), (sy_call_t *)swapon }, /* 85 = swapon */ + { SYF_MPSAFE | AS(getitimer_args), (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(SYF_MPSAFE | AS(gethostname_args),gethostname) }, /* 87 = old gethostname */ + { compat(SYF_MPSAFE | AS(sethostname_args),sethostname) }, /* 88 = old sethostname */ + { SYF_MPSAFE | 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { SYF_MPSAFE | AS(dup2_args), (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { SYF_MPSAFE | AS(fcntl_args), (sy_call_t *)fcntl }, /* 92 = fcntl */ + { SYF_MPSAFE | AS(select_args), (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { AS(fsync_args), (sy_call_t *)fsync }, /* 95 = fsync */ + { SYF_MPSAFE | AS(setpriority_args), (sy_call_t *)setpriority }, /* 96 = setpriority */ + { SYF_MPSAFE | AS(socket_args), (sy_call_t *)socket }, /* 97 = socket */ + { SYF_MPSAFE | AS(connect_args), (sy_call_t *)connect }, /* 98 = connect */ + { compat(SYF_MPSAFE | AS(accept_args),accept) }, /* 99 = old accept */ + { SYF_MPSAFE | AS(getpriority_args), (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(SYF_MPSAFE | AS(osend_args),send) }, /* 101 = old send */ + { compat(SYF_MPSAFE | AS(orecv_args),recv) }, /* 102 = 
old recv */ + { SYF_MPSAFE | AS(osigreturn_args), (sy_call_t *)osigreturn }, /* 103 = osigreturn */ + { SYF_MPSAFE | AS(bind_args), (sy_call_t *)bind }, /* 104 = bind */ + { SYF_MPSAFE | AS(setsockopt_args), (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { SYF_MPSAFE | AS(listen_args), (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(SYF_MPSAFE | AS(osigvec_args),sigvec) }, /* 108 = old sigvec */ + { compat(SYF_MPSAFE | AS(osigblock_args),sigblock) }, /* 109 = old sigblock */ + { compat(SYF_MPSAFE | AS(osigsetmask_args),sigsetmask) }, /* 110 = old sigsetmask */ + { compat(SYF_MPSAFE | AS(osigsuspend_args),sigsuspend) }, /* 111 = old sigsuspend */ + { compat(SYF_MPSAFE | AS(osigstack_args),sigstack) }, /* 112 = old sigstack */ + { compat(SYF_MPSAFE | AS(orecvmsg_args),recvmsg) }, /* 113 = old recvmsg */ + { compat(SYF_MPSAFE | AS(osendmsg_args),sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { SYF_MPSAFE | AS(gettimeofday_args), (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { SYF_MPSAFE | AS(getrusage_args), (sy_call_t *)getrusage }, /* 117 = getrusage */ + { SYF_MPSAFE | AS(getsockopt_args), (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { SYF_MPSAFE | AS(readv_args), (sy_call_t *)readv }, /* 120 = readv */ + { SYF_MPSAFE | AS(writev_args), (sy_call_t *)writev }, /* 121 = writev */ + { SYF_MPSAFE | AS(settimeofday_args), (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { AS(fchown_args), (sy_call_t *)fchown }, /* 123 = fchown */ + { AS(fchmod_args), (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(SYF_MPSAFE | AS(recvfrom_args),recvfrom) }, /* 125 = old recvfrom */ + { SYF_MPSAFE | AS(setreuid_args), (sy_call_t *)setreuid }, /* 126 = setreuid */ + { SYF_MPSAFE | AS(setregid_args), (sy_call_t *)setregid }, /* 127 = setregid */ + { AS(rename_args), (sy_call_t *)rename }, /* 128 = rename */ + { compat(AS(otruncate_args),truncate) }, /* 129 = old truncate */ + { compat(AS(oftruncate_args),ftruncate) }, /* 130 = old ftruncate */ + { SYF_MPSAFE | AS(flock_args), (sy_call_t *)flock }, /* 131 = flock */ + { AS(mkfifo_args), (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { SYF_MPSAFE | AS(sendto_args), (sy_call_t *)sendto }, /* 133 = sendto */ + { SYF_MPSAFE | AS(shutdown_args), (sy_call_t *)shutdown }, /* 134 = shutdown */ + { SYF_MPSAFE | AS(socketpair_args), (sy_call_t *)socketpair }, /* 135 = socketpair */ + { AS(mkdir_args), (sy_call_t *)mkdir }, /* 136 = mkdir */ + { AS(rmdir_args), (sy_call_t *)rmdir }, /* 137 = rmdir */ + { AS(utimes_args), (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { SYF_MPSAFE | AS(adjtime_args), (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(SYF_MPSAFE | AS(ogetpeername_args),getpeername) }, /* 141 = old getpeername */ + { compat(SYF_MPSAFE | 0,gethostid) }, /* 142 = old gethostid */ + { compat(SYF_MPSAFE | AS(osethostid_args),sethostid) }, /* 143 = old sethostid */ + { compat(SYF_MPSAFE | AS(ogetrlimit_args),getrlimit) }, /* 144 = old getrlimit */ + { compat(SYF_MPSAFE | AS(osetrlimit_args),setrlimit) }, /* 145 = old setrlimit */ + { compat(SYF_MPSAFE | AS(okillpg_args),killpg) }, /* 146 = old killpg */ + { SYF_MPSAFE | 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { AS(quotactl_args), (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(SYF_MPSAFE | 0,quota) }, /* 149 = old quota */ + { compat(SYF_MPSAFE | 
AS(getsockname_args),getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ + { SYF_MPSAFE | AS(nfssvc_args), (sy_call_t *)nosys }, /* 155 = nfssvc */ + { compat(AS(ogetdirentries_args),getdirentries) }, /* 156 = old getdirentries */ + { AS(statfs_args), (sy_call_t *)statfs }, /* 157 = statfs */ + { AS(fstatfs_args), (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ + { AS(getfh_args), (sy_call_t *)getfh }, /* 161 = getfh */ + { SYF_MPSAFE | AS(getdomainname_args), (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { SYF_MPSAFE | AS(setdomainname_args), (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { SYF_MPSAFE | AS(uname_args), (sy_call_t *)uname }, /* 164 = uname */ + { AS(sysarch_args), (sy_call_t *)sysarch }, /* 165 = sysarch */ + { SYF_MPSAFE | AS(rtprio_args), (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { SYF_MPSAFE | AS(semsys_args), (sy_call_t *)lkmressys }, /* 169 = semsys */ + { SYF_MPSAFE | AS(msgsys_args), (sy_call_t *)lkmressys }, /* 170 = msgsys */ + { SYF_MPSAFE | AS(shmsys_args), (sy_call_t *)lkmressys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { SYF_MPSAFE | AS(pread_args), (sy_call_t *)pread }, /* 173 = pread */ + { SYF_MPSAFE | AS(pwrite_args), (sy_call_t *)pwrite }, /* 174 = pwrite */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { SYF_MPSAFE | AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { SYF_MPSAFE | AS(setgid_args), (sy_call_t *)setgid }, /* 181 = setgid */ + { SYF_MPSAFE | AS(setegid_args), (sy_call_t *)setegid }, /* 182 = setegid */ + { SYF_MPSAFE | AS(seteuid_args), (sy_call_t *)seteuid }, /* 183 = seteuid */ + { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */ + { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */ + { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */ + { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */ + { AS(stat_args), (sy_call_t *)stat }, /* 188 = stat */ + { SYF_MPSAFE | AS(fstat_args), (sy_call_t *)fstat }, /* 189 = fstat */ + { AS(lstat_args), (sy_call_t *)lstat }, /* 190 = lstat */ + { AS(pathconf_args), (sy_call_t *)pathconf }, /* 191 = pathconf */ + { SYF_MPSAFE | AS(fpathconf_args), (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { SYF_MPSAFE | AS(__getrlimit_args), (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { SYF_MPSAFE | AS(__setrlimit_args), (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { AS(getdirentries_args), (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { SYF_MPSAFE | AS(mmap_args), (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { AS(lseek_args), (sy_call_t *)lseek }, /* 199 = lseek */ + { AS(truncate_args), (sy_call_t *)truncate }, /* 200 = truncate */ + { AS(ftruncate_args), (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { SYF_MPSAFE | AS(sysctl_args), (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { SYF_MPSAFE | AS(mlock_args), (sy_call_t *)mlock }, /* 203 = mlock */ + 
{ SYF_MPSAFE | AS(munlock_args), (sy_call_t *)munlock }, /* 204 = munlock */ + { AS(undelete_args), (sy_call_t *)undelete }, /* 205 = undelete */ + { AS(futimes_args), (sy_call_t *)futimes }, /* 206 = futimes */ + { SYF_MPSAFE | AS(getpgid_args), (sy_call_t *)getpgid }, /* 207 = getpgid */ + { 0, (sy_call_t *)nosys }, /* 208 = newreboot */ + { SYF_MPSAFE | AS(poll_args), (sy_call_t *)poll }, /* 209 = poll */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { AS(nosys_args), (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { SYF_MPSAFE | AS(__semctl_args), (sy_call_t *)lkmressys }, /* 220 = __semctl */ + { SYF_MPSAFE | AS(semget_args), (sy_call_t *)lkmressys }, /* 221 = semget */ + { SYF_MPSAFE | AS(semop_args), (sy_call_t *)lkmressys }, /* 222 = semop */ + { 0, (sy_call_t *)nosys }, /* 223 = semconfig */ + { SYF_MPSAFE | AS(msgctl_args), (sy_call_t *)lkmressys }, /* 224 = msgctl */ + { SYF_MPSAFE | AS(msgget_args), (sy_call_t *)lkmressys }, /* 225 = msgget */ + { SYF_MPSAFE | AS(msgsnd_args), (sy_call_t *)lkmressys }, /* 226 = msgsnd */ + { SYF_MPSAFE | AS(msgrcv_args), (sy_call_t *)lkmressys }, /* 227 = msgrcv */ + { SYF_MPSAFE | AS(shmat_args), (sy_call_t *)lkmressys }, /* 228 = shmat */ + { SYF_MPSAFE | AS(shmctl_args), (sy_call_t *)lkmressys }, /* 229 = shmctl */ + { SYF_MPSAFE | AS(shmdt_args), (sy_call_t *)lkmressys }, /* 230 = shmdt */ + { SYF_MPSAFE | AS(shmget_args), (sy_call_t *)lkmressys }, /* 231 = shmget */ + { SYF_MPSAFE | AS(clock_gettime_args), (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */ + { SYF_MPSAFE | AS(clock_settime_args), (sy_call_t *)clock_settime }, /* 233 = clock_settime */ + { SYF_MPSAFE | AS(clock_getres_args), (sy_call_t *)clock_getres }, /* 234 = clock_getres */ + { 0, (sy_call_t *)nosys }, /* 235 = timer_create */ + { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */ + { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */ + { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */ + { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */ + { SYF_MPSAFE | AS(nanosleep_args), (sy_call_t *)nanosleep }, /* 240 = nanosleep */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { SYF_MPSAFE | AS(minherit_args), (sy_call_t *)minherit }, /* 250 = minherit */ + { SYF_MPSAFE | AS(rfork_args), (sy_call_t *)rfork }, /* 251 = rfork */ + { SYF_MPSAFE | AS(openbsd_poll_args), (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */ + { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */ + { AS(lchown_args), (sy_call_t *)lchown }, /* 254 = lchown */ + { 0, (sy_call_t *)nosys }, /* 255 = nosys */ + { 0, (sy_call_t *)nosys }, /* 256 = nosys */ + { 0, (sy_call_t *)nosys }, /* 
257 = nosys */ + { 0, (sy_call_t *)nosys }, /* 258 = nosys */ + { 0, (sy_call_t *)nosys }, /* 259 = nosys */ + { 0, (sy_call_t *)nosys }, /* 260 = nosys */ + { 0, (sy_call_t *)nosys }, /* 261 = nosys */ + { 0, (sy_call_t *)nosys }, /* 262 = nosys */ + { 0, (sy_call_t *)nosys }, /* 263 = nosys */ + { 0, (sy_call_t *)nosys }, /* 264 = nosys */ + { 0, (sy_call_t *)nosys }, /* 265 = nosys */ + { 0, (sy_call_t *)nosys }, /* 266 = nosys */ + { 0, (sy_call_t *)nosys }, /* 267 = nosys */ + { 0, (sy_call_t *)nosys }, /* 268 = nosys */ + { 0, (sy_call_t *)nosys }, /* 269 = nosys */ + { 0, (sy_call_t *)nosys }, /* 270 = nosys */ + { 0, (sy_call_t *)nosys }, /* 271 = nosys */ + { AS(getdents_args), (sy_call_t *)getdents }, /* 272 = getdents */ + { 0, (sy_call_t *)nosys }, /* 273 = nosys */ + { AS(lchmod_args), (sy_call_t *)lchmod }, /* 274 = lchmod */ + { AS(lchown_args), (sy_call_t *)lchown }, /* 275 = netbsd_lchown */ + { AS(lutimes_args), (sy_call_t *)lutimes }, /* 276 = lutimes */ + { SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync }, /* 277 = netbsd_msync */ + { AS(nstat_args), (sy_call_t *)nstat }, /* 278 = nstat */ + { SYF_MPSAFE | AS(nfstat_args), (sy_call_t *)nfstat }, /* 279 = nfstat */ + { AS(nlstat_args), (sy_call_t *)nlstat }, /* 280 = nlstat */ + { 0, (sy_call_t *)nosys }, /* 281 = nosys */ + { 0, (sy_call_t *)nosys }, /* 282 = nosys */ + { 0, (sy_call_t *)nosys }, /* 283 = nosys */ + { 0, (sy_call_t *)nosys }, /* 284 = nosys */ + { 0, (sy_call_t *)nosys }, /* 285 = nosys */ + { 0, (sy_call_t *)nosys }, /* 286 = nosys */ + { 0, (sy_call_t *)nosys }, /* 287 = nosys */ + { 0, (sy_call_t *)nosys }, /* 288 = nosys */ + { 0, (sy_call_t *)nosys }, /* 289 = nosys */ + { 0, (sy_call_t *)nosys }, /* 290 = nosys */ + { 0, (sy_call_t *)nosys }, /* 291 = nosys */ + { 0, (sy_call_t *)nosys }, /* 292 = nosys */ + { 0, (sy_call_t *)nosys }, /* 293 = nosys */ + { 0, (sy_call_t *)nosys }, /* 294 = nosys */ + { 0, (sy_call_t *)nosys }, /* 295 = nosys */ + { 0, (sy_call_t *)nosys }, /* 296 = nosys */ + { AS(fhstatfs_args), (sy_call_t *)fhstatfs }, /* 297 = fhstatfs */ + { AS(fhopen_args), (sy_call_t *)fhopen }, /* 298 = fhopen */ + { AS(fhstat_args), (sy_call_t *)fhstat }, /* 299 = fhstat */ + { SYF_MPSAFE | AS(modnext_args), (sy_call_t *)modnext }, /* 300 = modnext */ + { SYF_MPSAFE | AS(modstat_args), (sy_call_t *)modstat }, /* 301 = modstat */ + { SYF_MPSAFE | AS(modfnext_args), (sy_call_t *)modfnext }, /* 302 = modfnext */ + { SYF_MPSAFE | AS(modfind_args), (sy_call_t *)modfind }, /* 303 = modfind */ + { SYF_MPSAFE | AS(kldload_args), (sy_call_t *)kldload }, /* 304 = kldload */ + { SYF_MPSAFE | AS(kldunload_args), (sy_call_t *)kldunload }, /* 305 = kldunload */ + { SYF_MPSAFE | AS(kldfind_args), (sy_call_t *)kldfind }, /* 306 = kldfind */ + { SYF_MPSAFE | AS(kldnext_args), (sy_call_t *)kldnext }, /* 307 = kldnext */ + { SYF_MPSAFE | AS(kldstat_args), (sy_call_t *)kldstat }, /* 308 = kldstat */ + { SYF_MPSAFE | AS(kldfirstmod_args), (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */ + { SYF_MPSAFE | AS(getsid_args), (sy_call_t *)getsid }, /* 310 = getsid */ + { SYF_MPSAFE | AS(setresuid_args), (sy_call_t *)setresuid }, /* 311 = setresuid */ + { SYF_MPSAFE | AS(setresgid_args), (sy_call_t *)setresgid }, /* 312 = setresgid */ + { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */ + { AS(aio_return_args), (sy_call_t *)lkmressys }, /* 314 = aio_return */ + { AS(aio_suspend_args), (sy_call_t *)lkmressys }, /* 315 = aio_suspend */ + { AS(aio_cancel_args), (sy_call_t *)lkmressys }, /* 316 = aio_cancel 
*/ + { AS(aio_error_args), (sy_call_t *)lkmressys }, /* 317 = aio_error */ + { AS(aio_read_args), (sy_call_t *)lkmressys }, /* 318 = aio_read */ + { AS(aio_write_args), (sy_call_t *)lkmressys }, /* 319 = aio_write */ + { AS(lio_listio_args), (sy_call_t *)lkmressys }, /* 320 = lio_listio */ + { SYF_MPSAFE | 0, (sy_call_t *)yield }, /* 321 = yield */ + { 0, (sy_call_t *)nosys }, /* 322 = obsolete thr_sleep */ + { 0, (sy_call_t *)nosys }, /* 323 = obsolete thr_wakeup */ + { SYF_MPSAFE | AS(mlockall_args), (sy_call_t *)mlockall }, /* 324 = mlockall */ + { SYF_MPSAFE | 0, (sy_call_t *)munlockall }, /* 325 = munlockall */ + { AS(__getcwd_args), (sy_call_t *)__getcwd }, /* 326 = __getcwd */ + { SYF_MPSAFE | AS(sched_setparam_args), (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */ + { SYF_MPSAFE | AS(sched_getparam_args), (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */ + { SYF_MPSAFE | AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */ + { SYF_MPSAFE | AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */ + { SYF_MPSAFE | 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */ + { SYF_MPSAFE | AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */ + { SYF_MPSAFE | AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */ + { SYF_MPSAFE | AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */ + { AS(utrace_args), (sy_call_t *)utrace }, /* 335 = utrace */ + { SYF_MPSAFE | AS(sendfile_args), (sy_call_t *)sendfile }, /* 336 = sendfile */ + { AS(kldsym_args), (sy_call_t *)kldsym }, /* 337 = kldsym */ + { SYF_MPSAFE | AS(jail_args), (sy_call_t *)jail }, /* 338 = jail */ + { 0, (sy_call_t *)nosys }, /* 339 = pioctl */ + { SYF_MPSAFE | AS(sigprocmask_args), (sy_call_t *)sigprocmask }, /* 340 = sigprocmask */ + { SYF_MPSAFE | AS(sigsuspend_args), (sy_call_t *)sigsuspend }, /* 341 = sigsuspend */ + { SYF_MPSAFE | AS(sigaction_args), (sy_call_t *)sigaction }, /* 342 = sigaction */ + { SYF_MPSAFE | AS(sigpending_args), (sy_call_t *)sigpending }, /* 343 = sigpending */ + { SYF_MPSAFE | AS(sigreturn_args), (sy_call_t *)sigreturn }, /* 344 = sigreturn */ + { 0, (sy_call_t *)nosys }, /* 345 = sigtimedwait */ + { 0, (sy_call_t *)nosys }, /* 346 = sigwaitinfo */ + { SYF_MPSAFE | AS(__acl_get_file_args), (sy_call_t *)__acl_get_file }, /* 347 = __acl_get_file */ + { SYF_MPSAFE | AS(__acl_set_file_args), (sy_call_t *)__acl_set_file }, /* 348 = __acl_set_file */ + { SYF_MPSAFE | AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd }, /* 349 = __acl_get_fd */ + { SYF_MPSAFE | AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd }, /* 350 = __acl_set_fd */ + { SYF_MPSAFE | AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file }, /* 351 = __acl_delete_file */ + { SYF_MPSAFE | AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd }, /* 352 = __acl_delete_fd */ + { SYF_MPSAFE | AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file }, /* 353 = __acl_aclcheck_file */ + { SYF_MPSAFE | AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd }, /* 354 = __acl_aclcheck_fd */ + { AS(extattrctl_args), (sy_call_t *)extattrctl }, /* 355 = extattrctl */ + { AS(extattr_set_file_args), (sy_call_t *)extattr_set_file }, /* 356 = extattr_set_file */ + { AS(extattr_get_file_args), (sy_call_t *)extattr_get_file }, /* 357 = extattr_get_file */ + { AS(extattr_delete_file_args), 
(sy_call_t *)extattr_delete_file }, /* 358 = extattr_delete_file */ + { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys }, /* 359 = aio_waitcomplete */ + { SYF_MPSAFE | AS(getresuid_args), (sy_call_t *)getresuid }, /* 360 = getresuid */ + { SYF_MPSAFE | AS(getresgid_args), (sy_call_t *)getresgid }, /* 361 = getresgid */ + { SYF_MPSAFE | 0, (sy_call_t *)kqueue }, /* 362 = kqueue */ + { SYF_MPSAFE | AS(kevent_args), (sy_call_t *)kevent }, /* 363 = kevent */ + { 0, (sy_call_t *)nosys }, /* 364 = __cap_get_proc */ + { 0, (sy_call_t *)nosys }, /* 365 = __cap_set_proc */ + { 0, (sy_call_t *)nosys }, /* 366 = __cap_get_fd */ + { 0, (sy_call_t *)nosys }, /* 367 = __cap_get_file */ + { 0, (sy_call_t *)nosys }, /* 368 = __cap_set_fd */ + { 0, (sy_call_t *)nosys }, /* 369 = __cap_set_file */ + { AS(nosys_args), (sy_call_t *)lkmressys }, /* 370 = lkmressys */ + { AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd }, /* 371 = extattr_set_fd */ + { AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd }, /* 372 = extattr_get_fd */ + { AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd }, /* 373 = extattr_delete_fd */ + { SYF_MPSAFE | AS(__setugid_args), (sy_call_t *)__setugid }, /* 374 = __setugid */ + { AS(nfsclnt_args), (sy_call_t *)nosys }, /* 375 = nfsclnt */ + { AS(eaccess_args), (sy_call_t *)eaccess }, /* 376 = eaccess */ + { 0, (sy_call_t *)nosys }, /* 377 = afs_syscall */ + { AS(nmount_args), (sy_call_t *)nmount }, /* 378 = nmount */ + { 0, (sy_call_t *)kse_exit }, /* 379 = kse_exit */ + { 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */ + { AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */ + { AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */ + { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ + { 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */ + { 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */ + { 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */ + { 0, (sy_call_t *)nosys }, /* 387 = __mac_get_file */ + { 0, (sy_call_t *)nosys }, /* 388 = __mac_set_fd */ + { 0, (sy_call_t *)nosys }, /* 389 = __mac_set_file */ + { AS(kenv_args), (sy_call_t *)kenv }, /* 390 = kenv */ + { AS(lchflags_args), (sy_call_t *)lchflags }, /* 391 = lchflags */ + { AS(uuidgen_args), (sy_call_t *)uuidgen }, /* 392 = uuidgen */ +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 0000000..6626197 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,345 @@ +/*- + * Copyright (c) 1994 Christopher G. Demetriou + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/syslog.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> + +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. + * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t(u_long, u_long); +static void acctwatch(void *); + +/* + * Accounting callout used for periodic scheduling of acctwatch. + */ +static struct callout acctwatch_callout; + +/* + * Accounting vnode pointer, and saved vnode pointer. + */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, "percentage of free disk space below which accounting stops"); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, "percentage of free disk space above which accounting resumes"); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, "frequency for checking the free space"); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. 
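+ * A non-NULL path opens that file and starts the free-space watcher;
+ * a NULL path simply disables accounting.  Only the superuser may
+ * call this.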
+ * + * MPSAFE + */ +int +acct(td, uap) + struct thread *td; + struct acct_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct nameidata nd; + int error, flags; + + /* Make sure that the caller is root. */ + error = suser(td); + if (error) + return (error); + + mtx_lock(&Giant); + /* + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. + */ + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + td); + flags = FWRITE; + error = vn_open(&nd, &flags, 0); + if (error) + goto done2; + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, td->td_ucred, td); + error = EACCES; + goto done2; + } + } + + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). + */ + if (acctp != NULLVP || savacctp != NULLVP) { + callout_stop(&acctwatch_callout); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + td->td_ucred, td); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + goto done2; + + /* + * Save the new accounting file vnode, and schedule the new + * free space watcher. + */ + acctp = nd.ni_vp; + callout_init(&acctwatch_callout, 0); + acctwatch(NULL); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) + */ + +int +acct_process(td) + struct thread *td; +{ + struct proc *p = td->td_proc; + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. + */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + mtx_lock_spin(&sched_lock); + calcru(p, &ut, &st, NULL); + mtx_unlock_spin(&sched_lock); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_ucred->cr_ruid; + acct.ac_gid = p->p_ucred->cr_rgid; + + /* (7) The terminal from which the process was started */ + PROC_LOCK(p); + SESS_LOCK(p->p_session); + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = dev2udev(p->p_pgrp->pg_session->s_ttyp->t_dev); + else + acct.ac_tty = NOUDEV; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Eliminate any file size rlimit. 
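+	 * Otherwise the vn_rdwr() append below could fail once the process
+	 * has already hit RLIMIT_FSIZE; a shared limit structure is copied
+	 * first so that only this process is affected.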
+ */ + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } + p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + + /* + * Write the accounting information to the file. + */ + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, td->td_ucred, + (int *)0, td)); +} + +/* + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} + +/* + * Periodically check the filesystem to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. + */ +/* ARGSUSED */ +static void +acctwatch(a) + void *a; +{ + struct statfs sb; + + if (savacctp != NULLVP) { + if (savacctp->v_type == VBAD) { + (void) vn_close(savacctp, FWRITE, NOCRED, NULL); + savacctp = NULLVP; + return; + } + (void)VFS_STATFS(savacctp->v_mount, &sb, (struct thread *)0); + if (sb.f_bavail > acctresume * sb.f_blocks / 100) { + acctp = savacctp; + savacctp = NULLVP; + log(LOG_NOTICE, "Accounting resumed\n"); + } + } else { + if (acctp == NULLVP) + return; + if (acctp->v_type == VBAD) { + (void) vn_close(acctp, FWRITE, NOCRED, NULL); + acctp = NULLVP; + return; + } + (void)VFS_STATFS(acctp->v_mount, &sb, (struct thread *)0); + if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { + savacctp = acctp; + acctp = NULLVP; + log(LOG_NOTICE, "Accounting suspended\n"); + } + } + callout_reset(&acctwatch_callout, acctchkfreq * hz, acctwatch, NULL); +} diff --git a/sys/kern/kern_acl.c b/sys/kern/kern_acl.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/kern_acl.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. + */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. 
While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. 
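+	 * ACL_GROUP_OBJ is matched against the file's owning group and
+	 * ACL_GROUP entries against their own qualifier; both are subject
+	 * to any ACL_MASK entry found above.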
+ */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. + */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. 
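+ * For example, a mode of 0640 yields ACL_READ|ACL_WRITE for the
+ * ACL_USER_OBJ entry, ACL_READ for ACL_GROUP_OBJ and no permissions
+ * for ACL_OTHER.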
+ */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. + * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. 
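+		 * The _OBJ, OTHER and MASK entries may not carry a qualifier,
+		 * while USER and GROUP entries must; the per-tag counts are
+		 * verified once the loop completes.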
+ */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. + */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. 
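+ * Deletion is expressed as VOP_SETACL() with a NULL ACL pointer.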
+ */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
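From userland, these path- and descriptor-based wrappers sit underneath the documented POSIX.1e library calls. For orientation, a small sketch using acl_get_file(3) and acl_free(3) (illustrative path, minimal error handling):

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

int
main(void)
{
        acl_t a;

        a = acl_get_file("/tmp/example", ACL_TYPE_ACCESS);
        if (a == NULL) {
                perror("acl_get_file");
                return (1);
        }
        /* ... walk the entries here ... */
        acl_free(a);
        return (0);
}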
+ * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c new file mode 100644 index 0000000..2e7ca8b --- /dev/null +++ b/sys/kern/kern_clock.c @@ -0,0 +1,492 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/ktr.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/smp.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> +#include <sys/bus.h> +#include <sys/interrupt.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#ifdef DEVICE_POLLING +extern void init_device_poll(void); +extern void hardclock_device_poll(void); +#endif /* DEVICE_POLLING */ + +static void initclocks(void *dummy); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +long cp_time[CPUSTATES]; + +SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), + "LU", "CPU time statistics"); + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. 
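A worked instance of the divider computed just below (the figures are only illustrative; i386 kernels of this vintage typically run the statistics clock from the RTC at 128 Hz and the profile clock at 1024 Hz):

/*
 *      stathz  = 128, profhz = 1024
 *      i       = stathz          = 128
 *      psratio = profhz / i      = 8
 *
 * While profiling is active, startprofclock() sets psdiv/pscnt to 8,
 * so only every eighth profile tick is counted as a statistics tick.
 */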
+ */ + psdiv = pscnt = 1; + cpu_initclocks(); + +#ifdef DEVICE_POLLING + init_device_poll(); +#endif + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * Each time the real-time timer fires, this function is called on all CPUs + * with each CPU passing in its curthread as the first argument. If possible + * a nice optimization in the future would be to allow the CPU receiving the + * actual real-time timer interrupt to call this function on behalf of the + * other CPUs rather than sending an IPI to all other CPUs so that they + * can call this function. Note that hardclock() calls hardclock_process() + * for the CPU receiving the timer interrupt, so only the other CPUs in the + * system need to call this function (or have it called on their behalf. + */ +void +hardclock_process(td, user) + struct thread *td; + int user; +{ + struct pstats *pstats; + struct proc *p = td->td_proc; + + /* + * Run current process's virtual and profile time, as needed. + */ + mtx_assert(&sched_lock, MA_OWNED); + if (p->p_flag & P_KSES) { + /* XXXKSE What to do? */ + } else { + pstats = p->p_stats; + if (user && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + p->p_sflag |= PS_ALRMPEND; + td->td_kse->ke_flags |= KEF_ASTPENDING; + } + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + p->p_sflag |= PS_PROFPEND; + td->td_kse->ke_flags |= KEF_ASTPENDING; + } + } +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + int need_softclock = 0; + + CTR0(KTR_CLK, "hardclock fired"); + mtx_lock_spin_flags(&sched_lock, MTX_QUIET); + hardclock_process(curthread, CLKF_USERMODE(frame)); + mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + + /* + * If no separate statistics clock is available, run it from here. + * + * XXX: this only works for UP + */ + if (stathz == 0) + statclock(frame); + +#ifdef DEVICE_POLLING + hardclock_device_poll(); /* this is very short and quick */ +#endif /* DEVICE_POLLING */ + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + mtx_lock_spin_flags(&callout_lock, MTX_QUIET); + ticks++; + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + need_softclock = 1; + } else if (softticks + 1 == ticks) + ++softticks; + mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); + + /* + * swi_sched acquires sched_lock, so we don't want to call it with + * callout_lock held; incorrect locking order. + */ + if (need_softclock) + swi_sched(softclock_ih, 0); +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. 
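A worked instance of that first case, assuming hz = 100 so that tick = 10000 microseconds:

/*
 *      tv    = { .tv_sec = 1, .tv_usec = 500000 }        (1.5 seconds)
 *      ticks = (1 * 1000000 + 500000 + (10000 - 1)) / 10000 + 1
 *            = 1509999 / 10000 + 1
 *            = 150 + 1 = 151
 *
 * The extra tick accounts for the partially elapsed current tick, so
 * the timeout can never fire early.
 */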
+ * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. + */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + /* + * XXX; Right now sched_lock protects statclock(), but perhaps + * it should be protected later on by a time_lock, which would + * cover psdiv, etc. as well. + */ + mtx_lock_spin(&sched_lock); + if ((p->p_sflag & PS_PROFIL) == 0) { + p->p_sflag |= PS_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + mtx_lock_spin(&sched_lock); + if (p->p_sflag & PS_PROFIL) { + p->p_sflag &= ~PS_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Do process and kernel statistics. Most of the statistics are only + * used by user-level statistics programs. The main exceptions are + * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. This function + * should be called by all CPUs in the system for each statistics clock + * interrupt. See the description of hardclock_process for more detail on + * this function's relationship to statclock. + */ +void +statclock_process(ke, pc, user) + struct kse *ke; + register_t pc; + int user; +{ +#ifdef GPROF + struct gmonparam *g; + int i; +#endif + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + struct proc *p = ke->ke_proc; + struct thread *td = ke->ke_thread; /* current thread */ + + KASSERT(ke == curthread->td_kse, ("statclock_process: td != curthread")); + mtx_assert(&sched_lock, MA_OWNED); + if (user) { + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled, record the tick. + */ + if (p->p_sflag & PS_PROFIL) + addupc_intr(ke, pc, 1); + if (pscnt < psdiv) + return; + /* + * Charge the time as appropriate. + */ + ke->ke_uticks++; + if (ke->ke_ksegrp->kg_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. 
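The cp_time[] counters charged just above are exported through the kern.cp_time sysctl declared near the top of this file. A small userland sketch of reading them (assuming the CP_* indices are taken from <sys/dkstat.h>, as this file does):

#include <sys/types.h>
#include <sys/dkstat.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        long cp[CPUSTATES];
        size_t len = sizeof(cp);

        if (sysctlbyname("kern.cp_time", cp, &len, NULL, 0) == -1) {
                perror("sysctlbyname");
                return (1);
        }
        printf("user %ld nice %ld sys %ld intr %ld idle %ld\n",
            cp[CP_USER], cp[CP_NICE], cp[CP_SYS], cp[CP_INTR], cp[CP_IDLE]);
        return (0);
}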
+ */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = pc - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt < psdiv) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { + ke->ke_iticks++; + cp_time[CP_INTR]++; + } else { + ke->ke_sticks++; + if (p != PCPU_GET(idlethread)->td_proc) + cp_time[CP_SYS]++; + else + cp_time[CP_IDLE]++; + } + } + + schedclock(ke->ke_thread); + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += pgtok(vm->vm_tsize); + ru->ru_idrss += pgtok(vm->vm_dsize); + ru->ru_isrss += pgtok(vm->vm_ssize); + rss = pgtok(vmspace_resident_count(vm)); + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. Most of the statistics are only + * used by user-level statistics programs. The main exceptions are + * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. + */ +void +statclock(frame) + register struct clockframe *frame; +{ + + CTR0(KTR_CLK, "statclock fired"); + mtx_lock_spin_flags(&sched_lock, MTX_QUIET); + if (--pscnt == 0) + pscnt = psdiv; + statclock_process(curthread->td_kse, CLKF_PC(frame), CLKF_USERMODE(frame)); + mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + bzero(&clkinfo, sizeof(clkinfo)); + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo", + "Rate and period of various kernel clocks"); diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c new file mode 100644 index 0000000..9d30d25 --- /dev/null +++ b/sys/kern/kern_condvar.c @@ -0,0 +1,579 @@ +/*- + * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +/* + * Common sanity checks for cv_wait* functions. + */ +#define CV_ASSERT(cvp, mp, td) do { \ + KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \ + KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \ + KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ + KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \ + mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ +} while (0) + +#ifdef INVARIANTS +#define CV_WAIT_VALIDATE(cvp, mp) do { \ + if (TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ + /* Only waiter. */ \ + (cvp)->cv_mtx = (mp); \ + } else { \ + /* \ + * Other waiter; assert that we're using the \ + * same mutex. \ + */ \ + KASSERT((cvp)->cv_mtx == (mp), \ + ("%s: Multiple mutexes", __func__)); \ + } \ +} while (0) +#define CV_SIGNAL_VALIDATE(cvp) do { \ + if (!TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ + KASSERT(mtx_owned((cvp)->cv_mtx), \ + ("%s: Mutex not owned", __func__)); \ + } \ +} while (0) +#else +#define CV_WAIT_VALIDATE(cvp, mp) +#define CV_SIGNAL_VALIDATE(cvp) +#endif + +static void cv_timedwait_end(void *arg); + +/* + * Initialize a condition variable. Must be called before use. + */ +void +cv_init(struct cv *cvp, const char *desc) +{ + + TAILQ_INIT(&cvp->cv_waitq); + cvp->cv_mtx = NULL; + cvp->cv_description = desc; +} + +/* + * Destroy a condition variable. The condition variable must be re-initialized + * in order to be re-used. + */ +void +cv_destroy(struct cv *cvp) +{ + + KASSERT(cv_waitq_empty(cvp), ("%s: cv_waitq non-empty", __func__)); +} + +/* + * Common code for cv_wait* functions. All require sched_lock. + */ + +/* + * Switch context. + */ +static __inline void +cv_switch(struct thread *td) +{ + + td->td_proc->p_stat = SSLEEP; + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td, + td->td_proc->p_pid, td->td_proc->p_comm); +} + +/* + * Switch context, catching signals. + */ +static __inline int +cv_switch_catch(struct thread *td) +{ + struct proc *p; + int sig; + + /* + * We put ourselves on the sleep queue and start our timeout before + * calling cursig, as we could stop there, and a wakeup or a SIGCONT (or + * both) could occur while we were stopped. A SIGCONT would cause us to + * be marked as SSLEEP without resuming us, thus we must be ready for + * sleep when cursig is called. If the wakeup happens while we're + * stopped, td->td_wchan will be 0 upon return from cursig. 
+ */ + td->td_flags |= TDF_SINTR; + mtx_unlock_spin(&sched_lock); + p = td->td_proc; + PROC_LOCK(p); + sig = cursig(p); /* XXXKSE */ + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (sig != 0) { + if (td->td_wchan != NULL) + cv_waitq_remove(td); + td->td_proc->p_stat = SRUN; + } else if (td->td_wchan != NULL) { + cv_switch(td); + } + td->td_flags &= ~TDF_SINTR; + + return sig; +} + +/* + * Add a thread to the wait queue of a condition variable. + */ +static __inline void +cv_waitq_add(struct cv *cvp, struct thread *td) +{ + + /* + * Process may be sitting on a slpque if asleep() was called, remove it + * before re-adding. + */ + if (td->td_wchan != NULL) + unsleep(td); + + td->td_flags |= TDF_CVWAITQ; + td->td_wchan = cvp; + td->td_wmesg = cvp->cv_description; + td->td_kse->ke_slptime = 0; /* XXXKSE */ + td->td_ksegrp->kg_slptime = 0; /* XXXKSE */ + td->td_base_pri = td->td_priority; + CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td, + td->td_proc->p_pid, td->td_proc->p_comm); + TAILQ_INSERT_TAIL(&cvp->cv_waitq, td, td_slpq); +} + +/* + * Wait on a condition variable. The current thread is placed on the condition + * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same + * condition variable will resume the thread. The mutex is released before + * sleeping and will be held on return. It is recommended that the mutex be + * held when cv_signal or cv_broadcast are called. + */ +void +cv_wait(struct cv *cvp, struct mtx *mp) +{ + struct thread *td; + WITNESS_SAVE_DECL(mp); + + td = curthread; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + cv_switch(td); + + mtx_unlock_spin(&sched_lock); +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); +} + +/* + * Wait on a condition variable, allowing interruption by signals. Return 0 if + * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if + * a signal was caught. If ERESTART is returned the system call should be + * restarted if possible. + */ +int +cv_wait_sig(struct cv *cvp, struct mtx *mp) +{ + struct thread *td; + struct proc *p; + int rval; + int sig; + WITNESS_SAVE_DECL(mp); + + td = curthread; + p = td->td_proc; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * procs or panic below, in case this is the idle process and + * already asleep. 
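Taken together with cv_signal() further below, cv_wait() implements the usual monitor pattern. A minimal sketch of the intended usage (hypothetical names; the mutex and condition variable are assumed to have been initialized elsewhere with mtx_init() and cv_init()):

static struct mtx example_mtx;
static struct cv example_cv;
static int example_ready;

/* Consumer: sleep until a producer has posted work. */
static void
example_consumer(void)
{

        mtx_lock(&example_mtx);
        while (example_ready == 0)
                cv_wait(&example_cv, &example_mtx);
        example_ready = 0;
        mtx_unlock(&example_mtx);
}

/* Producer: post work and wake one waiter while holding the mutex. */
static void
example_producer(void)
{

        mtx_lock(&example_mtx);
        example_ready = 1;
        cv_signal(&example_cv);
        mtx_unlock(&example_mtx);
}

The while loop matters: a waiter must re-check its predicate after cv_wait() returns, since another thread may already have consumed the work.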
+ */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + sig = cv_switch_catch(td); + + mtx_unlock_spin(&sched_lock); + + PROC_LOCK(p); + if (sig == 0) + sig = cursig(p); /* XXXKSE */ + if (sig != 0) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the + * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout + * expires. + */ +int +cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) +{ + struct thread *td; + int rval; + WITNESS_SAVE_DECL(mp); + + td = curthread; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td); + cv_switch(td); + + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (callout_stop(&td->td_slpcallout) == 0) { + /* + * Work around race with cv_timedwait_end similar to that + * between msleep and endtsleep. + */ + td->td_flags |= TDF_TIMEOUT; + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + + mtx_unlock_spin(&sched_lock); +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds, allowing + * interruption by signals. Returns 0 if the thread was resumed by cv_signal + * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if + * a signal was caught. + */ +int +cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) +{ + struct thread *td; + struct proc *p; + int rval; + int sig; + WITNESS_SAVE_DECL(mp); + + td = curthread; + p = td->td_proc; + rval = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, mp, td); + WITNESS_SLEEP(0, &mp->mtx_object); + WITNESS_SAVE(&mp->mtx_object, mp); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. 
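cv_timedwait() above bounds the sleep in clock ticks. Reusing the hypothetical example_cv/example_mtx from the previous sketch, a one-second bounded wait looks like this:

        int error;

        mtx_lock(&example_mtx);
        while (example_ready == 0) {
                error = cv_timedwait(&example_cv, &example_mtx, hz);
                if (error == EWOULDBLOCK)
                        break;  /* a full second passed without cv_signal() */
        }
        mtx_unlock(&example_mtx);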
+ */ + return 0; + } + + mtx_lock_spin(&sched_lock); + + CV_WAIT_VALIDATE(cvp, mp); + + DROP_GIANT(); + mtx_unlock(mp); + + cv_waitq_add(cvp, td); + callout_reset(&td->td_slpcallout, timo, cv_timedwait_end, td); + sig = cv_switch_catch(td); + + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (callout_stop(&td->td_slpcallout) == 0) { + /* + * Work around race with cv_timedwait_end similar to that + * between msleep and endtsleep. + */ + td->td_flags |= TDF_TIMEOUT; + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + + mtx_unlock_spin(&sched_lock); + + PROC_LOCK(p); + if (sig == 0) + sig = cursig(p); + if (sig != 0) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + mtx_lock(mp); + WITNESS_RESTORE(&mp->mtx_object, mp); + + return (rval); +} + +/* + * Common code for signal and broadcast. Assumes waitq is not empty. Must be + * called with sched_lock held. + */ +static __inline void +cv_wakeup(struct cv *cvp) +{ + struct thread *td; + + mtx_assert(&sched_lock, MA_OWNED); + td = TAILQ_FIRST(&cvp->cv_waitq); + KASSERT(td->td_wchan == cvp, ("%s: bogus wchan", __func__)); + KASSERT(td->td_flags & TDF_CVWAITQ, ("%s: not on waitq", __func__)); + TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); + td->td_flags &= ~TDF_CVWAITQ; + td->td_wchan = 0; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(td); */ + CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)", + td, td->td_proc->p_pid, td->td_proc->p_comm); + if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */ + updatepri(td); + td->td_kse->ke_slptime = 0; + td->td_ksegrp->kg_slptime = 0; + td->td_proc->p_stat = SRUN; + if (td->td_proc->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + } else { + td->td_proc->p_sflag |= PS_SWAPINREQ; + wakeup(&proc0); /* XXXKSE */ + } + /* END INLINE EXPANSION */ + } +} + +/* + * Signal a condition variable, wakes up one waiting thread. Will also wakeup + * the swapper if the process is not in memory, so that it can bring the + * sleeping process in. Note that this may also result in additional threads + * being made runnable. Should be called with the same mutex as was passed to + * cv_wait held. + */ +void +cv_signal(struct cv *cvp) +{ + + KASSERT(cvp != NULL, ("%s: cvp NULL", __func__)); + mtx_lock_spin(&sched_lock); + if (!TAILQ_EMPTY(&cvp->cv_waitq)) { + CV_SIGNAL_VALIDATE(cvp); + cv_wakeup(cvp); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Broadcast a signal to a condition variable. Wakes up all waiting threads. + * Should be called with the same mutex as was passed to cv_wait held. + */ +void +cv_broadcast(struct cv *cvp) +{ + + KASSERT(cvp != NULL, ("%s: cvp NULL", __func__)); + mtx_lock_spin(&sched_lock); + CV_SIGNAL_VALIDATE(cvp); + while (!TAILQ_EMPTY(&cvp->cv_waitq)) + cv_wakeup(cvp); + mtx_unlock_spin(&sched_lock); +} + +/* + * Remove a thread from the wait queue of its condition variable. This may be + * called externally. + */ +void +cv_waitq_remove(struct thread *td) +{ + struct cv *cvp; + + mtx_lock_spin(&sched_lock); + if ((cvp = td->td_wchan) != NULL && td->td_flags & TDF_CVWAITQ) { + TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); + td->td_flags &= ~TDF_CVWAITQ; + td->td_wchan = NULL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Timeout function for cv_timedwait. 
Put the thread on the runqueue and set + * its timeout flag. + */ +static void +cv_timedwait_end(void *arg) +{ + struct thread *td; + + td = arg; + CTR3(KTR_PROC, "cv_timedwait_end: thread %p (pid %d, %s)", td, td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + setrunqueue(td); + } else if (td->td_wchan != NULL) { + if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + setrunnable(td); + else + cv_waitq_remove(td); + td->td_flags |= TDF_TIMEOUT; + } else + td->td_flags |= TDF_TIMOFAIL; + mtx_unlock_spin(&sched_lock); +} diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..d1ce2fc --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,491 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/vnode.h> +#include <sys/queue.h> +#include <sys/ctype.h> +#include <machine/stdarg.h> + +#define cdevsw_ALLOCSTART (NUMCDEVSW/2) + +static struct cdevsw *cdevsw[NUMCDEVSW]; + +static MALLOC_DEFINE(M_DEVT, "dev_t", "dev_t storage"); + +/* + * This is the number of hash-buckets. Experiements with 'real-life' + * udev_t's show that a prime halfway between two powers of two works + * best. + */ +#define DEVT_HASH 83 + +/* The number of dev_t's we can create before malloc(9) kick in. 
*/ +#define DEVT_STASH 50 + +static struct specinfo devt_stash[DEVT_STASH]; + +static LIST_HEAD(, specinfo) dev_hash[DEVT_HASH]; + +static LIST_HEAD(, specinfo) dev_free; + +devfs_create_t *devfs_create_hook; +devfs_destroy_t *devfs_destroy_hook; +int devfs_present; + +static int ready_for_devs; + +static int free_devt; +SYSCTL_INT(_debug, OID_AUTO, free_devt, CTLFLAG_RW, &free_devt, 0, ""); + +/* XXX: This is a hack */ +void disk_dev_synth(dev_t dev); + +struct cdevsw * +devsw(dev_t dev) +{ + if (dev->si_devsw) + return (dev->si_devsw); + /* XXX: Hack around our backwards disk code */ + disk_dev_synth(dev); + if (dev->si_devsw) + return (dev->si_devsw); + if (devfs_present) + return (NULL); + return(cdevsw[major(dev)]); +} + +/* + * Add a cdevsw entry + */ + +int +cdevsw_add(struct cdevsw *newentry) +{ + + if (newentry->d_maj < 0 || newentry->d_maj >= NUMCDEVSW) { + printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", + newentry->d_name, newentry->d_maj); + return (EINVAL); + } + + if (cdevsw[newentry->d_maj]) { + printf("WARNING: \"%s\" is usurping \"%s\"'s cdevsw[]\n", + newentry->d_name, cdevsw[newentry->d_maj]->d_name); + } + + cdevsw[newentry->d_maj] = newentry; + + return (0); +} + +/* + * Remove a cdevsw entry + */ + +int +cdevsw_remove(struct cdevsw *oldentry) +{ + if (oldentry->d_maj < 0 || oldentry->d_maj >= NUMCDEVSW) { + printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", + oldentry->d_name, oldentry->d_maj); + return EINVAL; + } + + cdevsw[oldentry->d_maj] = NULL; + + return 0; +} + +/* + * dev_t and u_dev_t primitives + */ + +int +major(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return((x->si_udev >> 8) & 0xff); +} + +int +minor(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return(x->si_udev & 0xffff00ff); +} + +int +dev2unit(dev_t x) +{ + int i; + + if (x == NODEV) + return NOUDEV; + i = minor(x); + return ((i & 0xff) | (i >> 8)); +} + +int +unit2minor(int unit) +{ + + KASSERT(unit <= 0xffffff, ("Invalid unit (%d) in unit2minor", unit)); + return ((unit & 0xff) | ((unit << 8) & ~0xffff)); +} + +static dev_t +allocdev(void) +{ + static int stashed; + struct specinfo *si; + + if (stashed >= DEVT_STASH) { + MALLOC(si, struct specinfo *, sizeof(*si), M_DEVT, + M_USE_RESERVE | M_ZERO); + } else if (LIST_FIRST(&dev_free)) { + si = LIST_FIRST(&dev_free); + LIST_REMOVE(si, si_hash); + } else { + si = devt_stash + stashed++; + bzero(si, sizeof *si); + si->si_flags |= SI_STASHED; + } + LIST_INIT(&si->si_children); + TAILQ_INIT(&si->si_snapshots); + return (si); +} + +dev_t +makedev(int x, int y) +{ + struct specinfo *si; + udev_t udev; + int hash; + + if (x == umajor(NOUDEV) && y == uminor(NOUDEV)) + panic("makedev of NOUDEV"); + udev = (x << 8) | y; + hash = udev % DEVT_HASH; + LIST_FOREACH(si, &dev_hash[hash], si_hash) { + if (si->si_udev == udev) + return (si); + } + si = allocdev(); + si->si_udev = udev; + LIST_INSERT_HEAD(&dev_hash[hash], si, si_hash); + return (si); +} + +void +freedev(dev_t dev) +{ + + if (!free_devt) + return; + if (SLIST_FIRST(&dev->si_hlist)) + return; + if (dev->si_devsw || dev->si_drv1 || dev->si_drv2) + return; + LIST_REMOVE(dev, si_hash); + if (dev->si_flags & SI_STASHED) { + bzero(dev, sizeof(*dev)); + dev->si_flags |= SI_STASHED; + LIST_INSERT_HEAD(&dev_free, dev, si_hash); + } else { + FREE(dev, M_DEVT); + } +} + +udev_t +dev2udev(dev_t x) +{ + if (x == NODEV) + return NOUDEV; + return (x->si_udev); +} + +dev_t +udev2dev(udev_t x, int b) +{ + + if (x == NOUDEV) + return (NODEV); + switch (b) { + case 0: + return 
makedev(umajor(x), uminor(x)); + case 1: + return (NODEV); + default: + Debugger("udev2dev(...,X)"); + return NODEV; + } +} + +int +uminor(udev_t dev) +{ + return(dev & 0xffff00ff); +} + +int +umajor(udev_t dev) +{ + return((dev & 0xff00) >> 8); +} + +udev_t +makeudev(int x, int y) +{ + return ((x << 8) | y); +} + +dev_t +make_dev(struct cdevsw *devsw, int minor, uid_t uid, gid_t gid, int perms, const char *fmt, ...) +{ + dev_t dev; + va_list ap; + int i; + + KASSERT(umajor(makeudev(devsw->d_maj, minor)) == devsw->d_maj, + ("Invalid minor (%d) in make_dev", minor)); + + if (!ready_for_devs) { + printf("WARNING: Driver mistake: make_dev(%s) called before SI_SUB_DRIVERS\n", + fmt); + /* XXX panic here once drivers are cleaned up */ + } + + dev = makedev(devsw->d_maj, minor); + if (dev->si_flags & SI_NAMED) { + printf( "WARNING: Driver mistake: repeat make_dev(\"%s\")\n", + dev->si_name); + panic("don't do that"); + return (dev); + } + va_start(ap, fmt); + i = kvprintf(fmt, NULL, dev->si_name, 32, ap); + dev->si_name[i] = '\0'; + va_end(ap); + dev->si_devsw = devsw; + dev->si_uid = uid; + dev->si_gid = gid; + dev->si_mode = perms; + dev->si_flags |= SI_NAMED; + + if (devfs_create_hook) + devfs_create_hook(dev); + return (dev); +} + +int +dev_named(dev_t pdev, const char *name) +{ + dev_t cdev; + + if (strcmp(devtoname(pdev), name) == 0) + return (1); + LIST_FOREACH(cdev, &pdev->si_children, si_siblings) + if (strcmp(devtoname(cdev), name) == 0) + return (1); + return (0); +} + +void +dev_depends(dev_t pdev, dev_t cdev) +{ + + cdev->si_parent = pdev; + cdev->si_flags |= SI_CHILD; + LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings); +} + +dev_t +make_dev_alias(dev_t pdev, const char *fmt, ...) +{ + dev_t dev; + va_list ap; + int i; + + dev = allocdev(); + dev->si_flags |= SI_ALIAS; + dev->si_flags |= SI_NAMED; + dev_depends(pdev, dev); + va_start(ap, fmt); + i = kvprintf(fmt, NULL, dev->si_name, 32, ap); + dev->si_name[i] = '\0'; + va_end(ap); + + if (devfs_create_hook) + devfs_create_hook(dev); + return (dev); +} + +void +revoke_and_destroy_dev(dev_t dev) +{ + struct vnode *vp; + + GIANT_REQUIRED; + + vp = SLIST_FIRST(&dev->si_hlist); + if (vp != NULL) + VOP_REVOKE(vp, REVOKEALL); + destroy_dev(dev); +} + +void +destroy_dev(dev_t dev) +{ + + if (!(dev->si_flags & SI_NAMED)) { + printf( "WARNING: Driver mistake: destroy_dev on %d/%d\n", + major(dev), minor(dev)); + panic("don't do that"); + return; + } + + if (devfs_destroy_hook) + devfs_destroy_hook(dev); + if (dev->si_flags & SI_CHILD) { + LIST_REMOVE(dev, si_siblings); + dev->si_flags &= ~SI_CHILD; + } + while (!LIST_EMPTY(&dev->si_children)) + destroy_dev(LIST_FIRST(&dev->si_children)); + dev->si_drv1 = 0; + dev->si_drv2 = 0; + dev->si_devsw = 0; + bzero(&dev->__si_u, sizeof(dev->__si_u)); + dev->si_flags &= ~SI_NAMED; + dev->si_flags &= ~SI_ALIAS; + freedev(dev); +} + +const char * +devtoname(dev_t dev) +{ + char *p; + int mynor; + + if (dev->si_name[0] == '#' || dev->si_name[0] == '\0') { + p = dev->si_name; + if (devsw(dev)) + sprintf(p, "#%s/", devsw(dev)->d_name); + else + sprintf(p, "#%d/", major(dev)); + p += strlen(p); + mynor = minor(dev); + if (mynor < 0 || mynor > 255) + sprintf(p, "%#x", (u_int)mynor); + else + sprintf(p, "%d", mynor); + } + return (dev->si_name); +} + +int +dev_stdclone(char *name, char **namep, const char *stem, int *unit) +{ + int u, i; + + i = strlen(stem); + if (bcmp(stem, name, i) != 0) + return (0); + if (!isdigit(name[i])) + return (0); + u = 0; + if (name[i] == '0' && isdigit(name[i+1])) + return 
(0); + while (isdigit(name[i])) { + u *= 10; + u += name[i++] - '0'; + } + *unit = u; + if (namep) + *namep = &name[i]; + if (name[i]) + return (2); + return (1); +} + +/* + * Helper sysctl for devname(3). We're given a {u}dev_t and return + * the name, if any, registered by the device driver. + */ +static int +sysctl_devname(SYSCTL_HANDLER_ARGS) +{ + int error; + udev_t ud; + dev_t dev; + + error = SYSCTL_IN(req, &ud, sizeof (ud)); + if (error) + return (error); + if (ud == NOUDEV) + return(EINVAL); + dev = makedev(umajor(ud), uminor(ud)); + if (dev->si_name[0] == '\0') + error = ENOENT; + else + error = SYSCTL_OUT(req, dev->si_name, strlen(dev->si_name) + 1); + freedev(dev); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY, + NULL, 0, sysctl_devname, "", "devname(3) handler"); + +/* + * Set ready_for_devs; prior to this point, device creation is not allowed. + */ +static void +dev_set_ready(void *junk) +{ + ready_for_devs = 1; +} + +SYSINIT(dev_ready, SI_SUB_DEVFS, SI_ORDER_FIRST, dev_set_ready, NULL); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c new file mode 100644 index 0000000..15837d3 --- /dev/null +++ b/sys/kern/kern_descrip.c @@ -0,0 +1,2210 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
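Before kern_descrip.c begins: the dev_stdclone() helper that closes kern_conf.c above is easiest to follow with a worked case. A sketch (illustrative names) of how a cloning driver might parse a requested node name:

        char *rest;
        int unit;

        /* "da1s2" with stem "da": returns 2, unit == 1, rest -> "s2".   */
        /* "da1"   with stem "da": returns 1, unit == 1, rest -> "".     */
        /* "ad0", "da", "da01":    returns 0 (wrong stem, no digit, or a
         *                         leading zero).                        */
        switch (dev_stdclone(name, &rest, "da", &unit)) {
        case 1:
        case 2:
                /* ... create or look up the requested unit ... */
                break;
        default:
                break;
        }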
+ * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/conf.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/unistd.h> +#include <sys/resourcevar.h> +#include <sys/event.h> +#include <sys/sx.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + +uma_zone_t file_zone; + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = { + /* open */ fdopen, + /* close */ noclose, + /* read */ noread, + /* write */ nowrite, + /* ioctl */ noioctl, + /* poll */ nopoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "FD", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, +}; + +static int do_dup(struct filedesc *fdp, int old, int new, register_t *retval, struct thread *td); +static int badfo_readwrite(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int badfo_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int badfo_poll(struct file *fp, int events, + struct ucred *cred, struct thread *td); +static int badfo_kqfilter(struct file *fp, struct knote *kn); +static int badfo_stat(struct file *fp, struct stat *sb, struct thread *td); +static int badfo_close(struct file *fp, struct thread *td); + +/* + * Descriptor management. + */ +struct filelist filehead; /* head of list of open files */ +int nfiles; /* actual number of open files */ +extern int cmask; +struct sx filelist_lock; /* sx to protect filelist */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getdtablesize(td, uap) + struct thread *td; + struct getdtablesize_args *uap; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + td->td_retval[0] = + min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + mtx_unlock(&Giant); + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + * + * note: keep in mind that a potential race condition exists when closing + * descriptors from a shared descriptor table (via rfork). 
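For orientation, the classic userland use of the dup2() semantics implemented below is redirecting a standard descriptor (illustrative path):

#include <fcntl.h>
#include <unistd.h>

/* Point fd 1 (stdout) at a log file, then drop the temporary fd. */
static void
redirect_stdout(void)
{
        int fd;

        fd = open("/tmp/example.log", O_WRONLY | O_CREAT | O_APPEND, 0644);
        if (fd >= 0 && fd != STDOUT_FILENO) {
                dup2(fd, STDOUT_FILENO);
                close(fd);
        }
}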
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +dup2(td, uap) + struct thread *td; + struct dup2_args *uap; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register u_int old = uap->from, new = uap->to; + int i, error; + + FILEDESC_LOCK(fdp); +retry: + if (old >= fdp->fd_nfiles || + fdp->fd_ofiles[old] == NULL || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if (old == new) { + td->td_retval[0] = new; + FILEDESC_UNLOCK(fdp); + return (0); + } + if (new >= fdp->fd_nfiles) { + if ((error = fdalloc(td, new, &i))) { + FILEDESC_UNLOCK(fdp); + return (error); + } + /* + * fdalloc() may block, retest everything. + */ + goto retry; + } + error = do_dup(fdp, (int)old, (int)new, td->td_retval, td); + return(error); +} + +/* + * Duplicate a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +dup(td, uap) + struct thread *td; + struct dup_args *uap; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if ((error = fdalloc(td, 0, &new))) { + FILEDESC_UNLOCK(fdp); + return (error); + } + error = do_dup(fdp, (int)old, new, td->td_retval, td); + return (error); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fcntl(td, uap) + struct thread *td; + register struct fcntl_args *uap; +{ + register struct proc *p = td->td_proc; + register struct filedesc *fdp; + register struct file *fp; + register char *pop; + struct vnode *vp; + int i, tmp, error = 0, flg = F_POSIX; + struct flock fl; + u_int newmin; + struct proc *leaderp; + + mtx_lock(&Giant); + + fdp = p->p_fd; + FILEDESC_LOCK(fdp); + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + goto done2; + } + pop = &fdp->fd_ofileflags[uap->fd]; + + switch (uap->cmd) { + case F_DUPFD: + newmin = uap->arg; + if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + newmin >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); + error = EINVAL; + break; + } + if ((error = fdalloc(td, newmin, &i))) { + FILEDESC_UNLOCK(fdp); + break; + } + error = do_dup(fdp, uap->fd, i, td->td_retval, td); + break; + + case F_GETFD: + td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; + FILEDESC_UNLOCK(fdp); + break; + + case F_SETFD: + *pop = (*pop &~ UF_EXCLOSE) | + (uap->arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); + FILEDESC_UNLOCK(fdp); + break; + + case F_GETFL: + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = OFLAGS(fp->f_flag); + FILE_UNLOCK(fp); + break; + + case F_SETFL: + fhold(fp); + FILEDESC_UNLOCK(fdp); + fp->f_flag &= ~FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + if (error) { + fdrop(fp, td); + break; + } + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); + if (!error) { + fdrop(fp, td); + break; + } + fp->f_flag &= ~FNONBLOCK; + tmp = 0; + (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + fdrop(fp, td); + break; + + case F_GETOWN: + fhold(fp); + FILEDESC_UNLOCK(fdp); + error = fo_ioctl(fp, FIOGETOWN, (caddr_t)td->td_retval, td); + fdrop(fp, td); + break; + + case F_SETOWN: + fhold(fp); + FILEDESC_UNLOCK(fdp); + error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, td); + fdrop(fp, td); + break; + + case F_SETLKW: + flg |= F_WAIT; + /* Fall into F_SETLK */ + + case F_SETLK: + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + break; + } + vp = (struct vnode *)fp->f_data; + /* + * copyin/lockop may block + */ + fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) { + fdrop(fp, td); + break; + } + if (fl.l_whence == SEEK_CUR) { + if (fp->f_offset < 0 || + (fl.l_start > 0 && + fp->f_offset > OFF_MAX - fl.l_start)) { + fdrop(fp, td); + error = EOVERFLOW; + break; + } + fl.l_start += fp->f_offset; + } + + switch (fl.l_type) { + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p); + p->p_flag |= P_ADVLOCK; + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, + &fl, flg); + break; + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p); + p->p_flag |= P_ADVLOCK; + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, + &fl, flg); + break; + case F_UNLCK: + PROC_LOCK(p); + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_UNLCK, + &fl, F_POSIX); + break; + default: + error = EINVAL; + break; + } + fdrop(fp, td); + break; + + case F_GETLK: + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + break; + } + vp = (struct vnode *)fp->f_data; + /* + * copyin/lockop may block + */ + fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) { + fdrop(fp, td); + break; + } + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) { + fdrop(fp, td); + error = EINVAL; + break; + } + if (fl.l_whence == SEEK_CUR) { + if ((fl.l_start > 0 && + fp->f_offset > OFF_MAX - fl.l_start) || + (fl.l_start < 0 && + fp->f_offset < OFF_MIN - fl.l_start)) { + fdrop(fp, td); + error = EOVERFLOW; + break; + } + fl.l_start += fp->f_offset; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, + &fl, F_POSIX); + fdrop(fp, td); + if (error == 0) { + error = copyout((caddr_t)&fl, + (caddr_t)(intptr_t)uap->arg, sizeof(fl)); + } + break; + default: + FILEDESC_UNLOCK(fdp); + error = EINVAL; + break; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Common code for dup, dup2, and fcntl(F_DUPFD). 
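The F_SETLK/F_SETLKW cases above copy a struct flock in from userland and hand it to VOP_ADVLOCK(). The matching userland call is a sketch like the following (fd is assumed to be an open, writable descriptor):

#include <fcntl.h>
#include <unistd.h>

/* Take an exclusive advisory lock on the whole file, waiting if needed. */
static int
lock_whole_file(int fd)
{
        struct flock fl;

        fl.l_type = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start = 0;
        fl.l_len = 0;                   /* zero length == to end of file */
        return (fcntl(fd, F_SETLKW, &fl));
}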
+ * filedesc must be locked, but will be unlocked as a side effect. + */ +static int +do_dup(fdp, old, new, retval, td) + register struct filedesc *fdp; + register int old, new; + register_t *retval; + struct thread *td; +{ + struct file *fp; + struct file *delfp; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + /* + * Save info on the descriptor being overwritten. We have + * to do the unmap now, but we cannot close it without + * introducing an ownership race for the slot. + */ + delfp = fdp->fd_ofiles[new]; +#if 0 + if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED)) + (void) munmapfd(td, new); +#endif + + /* + * Duplicate the source descriptor, update lastfile + */ + fp = fdp->fd_ofiles[old]; + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + fhold(fp); + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + + FILEDESC_UNLOCK(fdp); + + /* + * If we dup'd over a valid file, we now own the reference to it + * and must dispose of it using closef() semantics (as if a + * close() were performed on it). + */ + if (delfp) { + mtx_lock(&Giant); + (void) closef(delfp, td); + mtx_unlock(&Giant); + } + return (0); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(sigiop) + struct sigio **sigiop; +{ + struct sigio *sigio; + + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + *(sigio->sio_myref) = NULL; + if ((sigio)->sio_pgid < 0) { + struct pgrp *pg = (sigio)->sio_pgrp; + PGRP_LOCK(pg); + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + PGRP_UNLOCK(pg); + } else { + struct proc *p = (sigio)->sio_proc; + PROC_LOCK(p); + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); +} + +/* + * Free a list of sigio structures. + * We only need to lock the SIGIO_LOCK because we have made ourselves + * inaccessable to callers of fsetown and therefore do not need to lock + * the proc or pgrp struct for the list manipulation. + */ +void +funsetownlst(sigiolst) + struct sigiolst *sigiolst; +{ + struct sigio *sigio; + struct proc *p; + struct pgrp *pg; + + sigio = SLIST_FIRST(sigiolst); + if (sigio == NULL) + return; + + p = NULL; + pg = NULL; + + /* + * Every entry of the list should belong + * to a single proc or pgrp. + */ + if (sigio->sio_pgid < 0) { + pg = sigio->sio_pgrp; + PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); + } else /* if (sigio->sio_pgid > 0) */ { + p = sigio->sio_proc; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + } + + SIGIO_LOCK(); + while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { + *(sigio->sio_myref) = NULL; + if (pg != NULL) { + KASSERT(sigio->sio_pgid < 0, + ("Proc sigio in pgrp sigio list")); + KASSERT(sigio->sio_pgrp == pg, + ("Bogus pgrp in sigio list")); + PGRP_LOCK(pg); + SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, + sio_pgsigio); + PGRP_UNLOCK(pg); + } else /* if (p != NULL) */ { + KASSERT(sigio->sio_pgid > 0, + ("Pgrp sigio in proc sigio list")); + KASSERT(sigio->sio_proc == p, + ("Bogus proc in sigio list")); + PROC_LOCK(p); + SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, + sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); + SIGIO_LOCK(); + } + SIGIO_UNLOCK(); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 
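fsetown() below is the kernel side of F_SETOWN/FIOSETOWN. From userland, the usual sequence to receive SIGIO for a descriptor is a sketch like this (a SIGIO handler must be installed separately with sigaction(2)):

#include <fcntl.h>
#include <unistd.h>

/* Deliver SIGIO for this descriptor to the calling process. */
static int
enable_sigio(int fd)
{
        int flags;

        if (fcntl(fd, F_SETOWN, getpid()) == -1)
                return (-1);
        if ((flags = fcntl(fd, F_GETFL)) == -1)
                return (-1);
        return (fcntl(fd, F_SETFL, flags | O_ASYNC));
}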
+ * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pgid, sigiop) + pid_t pgid; + struct sigio **sigiop; +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int ret; + + if (pgid == 0) { + funsetown(sigiop); + return (0); + } + + ret = 0; + + /* Allocate and fill in the new sigio out of locks. */ + MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); + sigio->sio_pgid = pgid; + sigio->sio_ucred = crhold(curthread->td_ucred); + sigio->sio_myref = sigiop; + + sx_slock(&proctree_lock); + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) { + ret = ESRCH; + goto fail; + } + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + PROC_UNLOCK(proc); + if (proc->p_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) { + ret = ESRCH; + goto fail; + } + PGRP_UNLOCK(pgrp); + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + if (pgrp->pg_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + proc = NULL; + } + funsetown(sigiop); + if (pgid > 0) { + PROC_LOCK(proc); + /* + * since funsetownlst() is called without the proctree + * locked we need to check for P_WEXIT. + * XXX: is ESRCH correct? + */ + if ((proc->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(proc); + ret = ESRCH; + goto fail; + } + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + PROC_UNLOCK(proc); + } else { + PGRP_LOCK(pgrp); + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + PGRP_UNLOCK(pgrp); + } + sx_sunlock(&proctree_lock); + SIGIO_LOCK(); + *sigiop = sigio; + SIGIO_UNLOCK(); + return (0); + +fail: + sx_sunlock(&proctree_lock); + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); + return (ret); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(sigio) + struct sigio *sigio; +{ + return (sigio != NULL ? sigio->sio_pgid : 0); +} + +/* + * Close a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +close(td, uap) + struct thread *td; + struct close_args *uap; +{ + register struct filedesc *fdp; + register struct file *fp; + register int fd = uap->fd; + int error = 0; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + error = EBADF; + goto done2; + } +#if 0 + if (fdp->fd_ofileflags[fd] & UF_MAPPED) + (void) munmapfd(td, fd); +#endif + fdp->fd_ofiles[fd] = NULL; + fdp->fd_ofileflags[fd] = 0; + + /* + * we now hold the fp reference that used to be owned by the descriptor + * array. 
+ */ + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + if (fd < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, fd); + } else + FILEDESC_UNLOCK(fdp); + + error = closef(fp, td); +done2: + mtx_unlock(&Giant); + return(error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ofstat(td, uap) + struct thread *td; + register struct ofstat_args *uap; +{ + struct file *fp; + struct stat ub; + struct ostat oub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) { + cvtstat(&ub, &oub); + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fstat(td, uap) + struct thread *td; + struct fstat_args *uap; +{ + struct file *fp; + struct stat ub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nfstat(td, uap) + struct thread *td; + register struct nfstat_args *uap; +{ + struct file *fp; + struct stat ub; + struct nstat nub; + int error; + + mtx_lock(&Giant); + if ((error = fget(td, uap->fd, &fp)) != 0) + goto done2; + error = fo_stat(fp, &ub, td); + if (error == 0) { + cvtnstat(&ub, &nub); + error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fpathconf(td, uap) + struct thread *td; + register struct fpathconf_args *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + + switch (fp->f_type) { + case DTYPE_PIPE: + case DTYPE_SOCKET: + if (uap->name != _PC_PIPE_BUF) { + error = EINVAL; + } else { + td->td_retval[0] = PIPE_BUF; + error = 0; + } + break; + case DTYPE_FIFO: + case DTYPE_VNODE: + vp = (struct vnode *)fp->f_data; + mtx_lock(&Giant); + error = VOP_PATHCONF(vp, uap->name, td->td_retval); + mtx_unlock(&Giant); + break; + default: + error = EOPNOTSUPP; + break; + } + fdrop(fp, td); + return(error); +} + +/* + * Allocate a file descriptor for the process. 
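+ *
+ * The search starts at the higher of the requested descriptor and
+ * fd_freefile; if no free slot is found the ofile array is grown,
+ * first to NDEXTENT entries and then by doubling.  The debug.fdexpand
+ * sysctl below counts how many times such an expansion has happened.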
+ */ +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); + +int +fdalloc(td, want, result) + struct thread *td; + int want; + int *result; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register int i; + int lim, last, nfiles; + struct file **newofile, **oldofile; + char *newofileflags; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + /* + * Search for a free descriptor starting at the higher + * of want or fd_freefile. If that fails, consider + * expanding the ofile array. + */ + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + for (;;) { + last = min(fdp->fd_nfiles, lim); + if ((i = want) < fdp->fd_freefile) + i = fdp->fd_freefile; + for (; i < last; i++) { + if (fdp->fd_ofiles[i] == NULL) { + fdp->fd_ofileflags[i] = 0; + if (i > fdp->fd_lastfile) + fdp->fd_lastfile = i; + if (want <= fdp->fd_freefile) + fdp->fd_freefile = i; + *result = i; + return (0); + } + } + + /* + * No space in current array. Expand? + */ + if (fdp->fd_nfiles >= lim) + return (EMFILE); + if (fdp->fd_nfiles < NDEXTENT) + nfiles = NDEXTENT; + else + nfiles = 2 * fdp->fd_nfiles; + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + MALLOC(newofile, struct file **, nfiles * OFILESIZE, + M_FILEDESC, M_WAITOK); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + + /* + * deal with file-table extend race that might have occured + * when malloc was blocked. + */ + if (fdp->fd_nfiles >= nfiles) { + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + FREE(newofile, M_FILEDESC); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + continue; + } + newofileflags = (char *) &newofile[nfiles]; + /* + * Copy the existing ofile and ofileflags arrays + * and zero the new portion of each array. + */ + bcopy(fdp->fd_ofiles, newofile, + (i = sizeof(struct file *) * fdp->fd_nfiles)); + bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); + bcopy(fdp->fd_ofileflags, newofileflags, + (i = sizeof(char) * fdp->fd_nfiles)); + bzero(newofileflags + i, nfiles * sizeof(char) - i); + if (fdp->fd_nfiles > NDFILE) + oldofile = fdp->fd_ofiles; + else + oldofile = NULL; + fdp->fd_ofiles = newofile; + fdp->fd_ofileflags = newofileflags; + fdp->fd_nfiles = nfiles; + fdexpand++; + if (oldofile != NULL) { + FILEDESC_UNLOCK(fdp); + mtx_lock(&Giant); + FREE(oldofile, M_FILEDESC); + mtx_unlock(&Giant); + FILEDESC_LOCK(fdp); + } + } + return (0); +} + +/* + * Check to see whether n user file descriptors + * are available to the process p. + */ +int +fdavail(td, n) + struct thread *td; + register int n; +{ + struct proc *p = td->td_proc; + register struct filedesc *fdp = td->td_proc->p_fd; + register struct file **fpp; + register int i, lim, last; + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + + last = min(fdp->fd_nfiles, lim); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { + if (*fpp == NULL && --n <= 0) + return (1); + } + return (0); +} + +/* + * Create a new open file structure and allocate + * a file decriptor for the process that refers to it. 
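+ *
+ * A typical caller (compare fdcheckstd() later in this file and
+ * kqueue() in kern_event.c) looks roughly like:
+ *
+ *    error = falloc(td, &fp, &fd);
+ *    if (error)
+ *            return (error);
+ *    ... initialize fp->f_data, fp->f_flag, fp->f_ops, fp->f_type ...
+ *    td->td_retval[0] = fd;
+ *
+ * The new file is created with f_count == 1 and f_ops pointing at
+ * badfileops until the caller installs real file operations.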
+ */ +int +falloc(td, resultfp, resultfd) + register struct thread *td; + struct file **resultfp; + int *resultfd; +{ + struct proc *p = td->td_proc; + register struct file *fp, *fq; + int error, i; + + sx_xlock(&filelist_lock); + if (nfiles >= maxfiles) { + sx_xunlock(&filelist_lock); + tablefull("file"); + return (ENFILE); + } + nfiles++; + sx_xunlock(&filelist_lock); + /* + * Allocate a new file descriptor. + * If the process has file descriptor zero open, add to the list + * of open files at that point, otherwise put it at the front of + * the list of open files. + */ + fp = uma_zalloc(file_zone, M_WAITOK); + bzero(fp, sizeof(*fp)); + + /* + * wait until after malloc (which may have blocked) returns before + * allocating the slot, else a race might have shrunk it if we had + * allocated it before the malloc. + */ + FILEDESC_LOCK(p->p_fd); + if ((error = fdalloc(td, 0, &i))) { + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + nfiles--; + sx_xunlock(&filelist_lock); + uma_zfree(file_zone, fp); + return (error); + } + fp->f_mtxp = mtx_pool_alloc(); + fp->f_gcflag = 0; + fp->f_count = 1; + fp->f_cred = crhold(td->td_ucred); + fp->f_ops = &badfileops; + fp->f_seqcount = 1; + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + FILEDESC_LOCK(p->p_fd); + if ((fq = p->p_fd->fd_ofiles[0])) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + p->p_fd->fd_ofiles[i] = fp; + FILEDESC_UNLOCK(p->p_fd); + sx_xunlock(&filelist_lock); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Free a file descriptor. + */ +void +ffree(fp) + register struct file *fp; +{ + + KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!")); + sx_xlock(&filelist_lock); + LIST_REMOVE(fp, f_list); + nfiles--; + sx_xunlock(&filelist_lock); + crfree(fp->f_cred); + uma_zfree(file_zone, fp); +} + +/* + * Build a new filedesc structure. + */ +struct filedesc * +fdinit(td) + struct thread *td; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = td->td_proc->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK | M_ZERO); + mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + FILEDESC_LOCK(&newfdp->fd_fd); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + if (newfdp->fd_fd.fd_cdir) + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + if (newfdp->fd_fd.fd_rdir) + VREF(newfdp->fd_fd.fd_rdir); + newfdp->fd_fd.fd_jdir = fdp->fd_jdir; + if (newfdp->fd_fd.fd_jdir) + VREF(newfdp->fd_fd.fd_jdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + newfdp->fd_fd.fd_knlistsize = -1; + FILEDESC_UNLOCK(&newfdp->fd_fd); + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + FILEDESC_LOCK(p->p_fd); + p->p_fd->fd_refcnt++; + FILEDESC_UNLOCK(p->p_fd); + return (p->p_fd); +} + +/* + * Copy a filedesc structure. + */ +struct filedesc * +fdcopy(td) + struct thread *td; +{ + register struct filedesc *newfdp, *fdp = td->td_proc->p_fd; + register struct file **fpp; + register int i, j; + + /* Certain daemons might not have file descriptors. 
*/ + if (fdp == NULL) + return (NULL); + + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + FILEDESC_UNLOCK(fdp); + MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + bcopy(fdp, newfdp, sizeof(struct filedesc)); + FILEDESC_UNLOCK(fdp); + bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx)); + mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); + if (newfdp->fd_cdir) + VREF(newfdp->fd_cdir); + if (newfdp->fd_rdir) + VREF(newfdp->fd_rdir); + if (newfdp->fd_jdir) + VREF(newfdp->fd_jdir); + newfdp->fd_refcnt = 1; + + /* + * If the number of open files fits in the internal arrays + * of the open file structure, use them, otherwise allocate + * additional memory for the number of descriptors currently + * in use. + */ + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + if (newfdp->fd_lastfile < NDFILE) { + newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; + newfdp->fd_ofileflags = + ((struct filedesc0 *) newfdp)->fd_dfileflags; + i = NDFILE; + } else { + /* + * Compute the smallest multiple of NDEXTENT needed + * for the file descriptors currently in use, + * allowing the table to shrink. + */ +retry: + i = newfdp->fd_nfiles; + while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) + i /= 2; + FILEDESC_UNLOCK(fdp); + MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, + M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + j = newfdp->fd_nfiles; + while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2) + j /= 2; + if (i != j) { + /* + * The size of the original table has changed. + * Go over once again. + */ + FILEDESC_UNLOCK(fdp); + FREE(newfdp->fd_ofiles, M_FILEDESC); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + goto retry; + } + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; + } + newfdp->fd_nfiles = i; + bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); + bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); + + /* + * kq descriptors cannot be copied. + */ + if (newfdp->fd_knlistsize != -1) { + fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; + for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { + if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { + *fpp = NULL; + if (i < newfdp->fd_freefile) + newfdp->fd_freefile = i; + } + if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) + newfdp->fd_lastfile--; + } + newfdp->fd_knlist = NULL; + newfdp->fd_knlistsize = -1; + newfdp->fd_knhash = NULL; + newfdp->fd_knhashmask = 0; + } + + fpp = newfdp->fd_ofiles; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp != NULL) { + fhold(*fpp); + } + } + return (newfdp); +} + +/* + * Release a filedesc structure. + */ +void +fdfree(td) + struct thread *td; +{ + register struct filedesc *fdp; + struct file **fpp; + register int i; + + fdp = td->td_proc->p_fd; + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + FILEDESC_LOCK(fdp); + if (--fdp->fd_refcnt > 0) { + FILEDESC_UNLOCK(fdp); + return; + } + /* + * we are the last reference to the structure, we can + * safely assume it will not change out from under us. 
+ */ + FILEDESC_UNLOCK(fdp); + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp) + (void) closef(*fpp, td); + } + + PROC_LOCK(td->td_proc); + td->td_proc->p_fd = NULL; + PROC_UNLOCK(td->td_proc); + + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + if (fdp->fd_cdir) + vrele(fdp->fd_cdir); + if (fdp->fd_rdir) + vrele(fdp->fd_rdir); + if (fdp->fd_jdir) + vrele(fdp->fd_jdir); + if (fdp->fd_knlist) + FREE(fdp->fd_knlist, M_KQUEUE); + if (fdp->fd_knhash) + FREE(fdp->fd_knhash, M_KQUEUE); + mtx_destroy(&fdp->fd_mtx); + FREE(fdp, M_FILEDESC); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. + * + * This is a gross hack to plug the hole. A better solution would involve + * a special vop or other form of generalized access control mechanism. We + * go ahead and just reject all procfs filesystems accesses as dangerous. + * + * Since setugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. We also don't for check setugidness since we know we are. + */ +static int +is_unsafe(struct file *fp) +{ + if (fp->f_type == DTYPE_VNODE && + ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS) + return (1); + return (0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +setugidsafety(td) + struct thread *td; +{ + struct filedesc *fdp = td->td_proc->p_fd; + register int i; + + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + /* + * note: fdp->fd_ofiles may be reallocated out from under us while + * we are blocked in a close. Be careful! + */ + FILEDESC_LOCK(fdp); + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (i > 2) + break; + if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { + struct file *fp; + +#if 0 + if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0) + (void) munmapfd(td, i); +#endif + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_LOCK(fdp); + } + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(td) + struct thread *td; +{ + struct filedesc *fdp = td->td_proc->p_fd; + register int i; + + /* Certain daemons might not have file descriptors. */ + if (fdp == NULL) + return; + + FILEDESC_LOCK(fdp); + + /* + * We cannot cache fd_ofiles or fd_ofileflags since operations + * may block and rip them out from under us. + */ + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (fdp->fd_ofiles[i] != NULL && + (fdp->fd_ofileflags[i] & UF_EXCLOSE)) { + struct file *fp; + +#if 0 + if (fdp->fd_ofileflags[i] & UF_MAPPED) + (void) munmapfd(td, i); +#endif + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); + knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. 
+ */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_LOCK(fdp); + } + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(td) + struct thread *td; +{ + struct nameidata nd; + struct filedesc *fdp; + struct file *fp; + register_t retval; + int fd, i, error, flags, devnull; + + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return (0); + devnull = -1; + error = 0; + for (i = 0; i < 3; i++) { + if (fdp->fd_ofiles[i] != NULL) + continue; + if (devnull < 0) { + error = falloc(td, &fp, &fd); + if (error != 0) + break; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null", + td); + flags = FREAD | FWRITE; + error = vn_open(&nd, &flags, 0); + if (error != 0) { + FILEDESC_LOCK(fdp); + fdp->fd_ofiles[i] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + break; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + fp->f_data = (caddr_t)nd.ni_vp; + fp->f_flag = flags; + fp->f_ops = &vnops; + fp->f_type = DTYPE_VNODE; + VOP_UNLOCK(nd.ni_vp, 0, td); + devnull = fd; + } else { + FILEDESC_LOCK(fdp); + error = fdalloc(td, 0, &fd); + if (error != 0) { + FILEDESC_UNLOCK(fdp); + break; + } + error = do_dup(fdp, devnull, fd, &retval, td); + if (error != 0) + break; + } + } + return (error); +} + +/* + * Internal form of close. + * Decrement reference count on file structure. + * Note: td may be NULL when closing a file + * that was being passed in a message. + */ +int +closef(fp, td) + register struct file *fp; + register struct thread *td; +{ + struct vnode *vp; + struct flock lf; + + if (fp == NULL) + return (0); + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor. + */ + if (td && (td->td_proc->p_flag & P_ADVLOCK) && + fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + return (fdrop(fp, td)); +} + +/* + * Drop reference on struct file passed in, may call closef if the + * reference hits zero. + */ +int +fdrop(fp, td) + struct file *fp; + struct thread *td; +{ + + FILE_LOCK(fp); + return (fdrop_locked(fp, td)); +} + +/* + * Extract the file pointer associated with the specified descriptor for + * the current user process. + * + * If the descriptor doesn't exist, EBADF is returned. + * + * If the descriptor exists but doesn't match 'flags' then + * return EBADF for read attempts and EINVAL for write attempts. + * + * If 'hold' is set (non-zero) the file's refcount will be bumped on return. + * It should be droped with fdrop(). + * If it is not set, then the refcount will not be bumped however the + * thread's filedesc struct will be returned locked (for fgetsock). 
+ * + * If an error occured the non-zero error is returned and *fpp is set to NULL. + * Otherwise *fpp is set and zero is returned. + */ +static __inline +int +_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) +{ + struct filedesc *fdp; + struct file *fp; + + *fpp = NULL; + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return(EBADF); + FILEDESC_LOCK(fdp); + if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { + FILEDESC_UNLOCK(fdp); + return(EBADF); + } + + /* + * Note: FREAD failures returns EBADF to maintain backwards + * compatibility with what routines returned before. + * + * Only one flag, or 0, may be specified. + */ + if (flags == FREAD && (fp->f_flag & FREAD) == 0) { + FILEDESC_UNLOCK(fdp); + return(EBADF); + } + if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { + FILEDESC_UNLOCK(fdp); + return(EINVAL); + } + if (hold) { + fhold(fp); + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return(0); +} + +int +fget(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, 0, 1)); +} + +int +fget_read(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, FREAD, 1)); +} + +int +fget_write(struct thread *td, int fd, struct file **fpp) +{ + return(_fget(td, fd, fpp, FWRITE, 1)); +} + +/* + * Like fget() but loads the underlying vnode, or returns an error if + * the descriptor does not represent a vnode. Note that pipes use vnodes + * but never have VM objects (so VOP_GETVOBJECT() calls will return an + * error). The returned vnode will be vref()d. + */ + +static __inline +int +_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) +{ + struct file *fp; + int error; + + *vpp = NULL; + if ((error = _fget(td, fd, &fp, 0, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + error = EINVAL; + } else { + *vpp = (struct vnode *)fp->f_data; + vref(*vpp); + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (error); +} + +int +fgetvp(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, 0)); +} + +int +fgetvp_read(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, FREAD)); +} + +int +fgetvp_write(struct thread *td, int fd, struct vnode **vpp) +{ + return(_fgetvp(td, fd, vpp, FWRITE)); +} + +/* + * Like fget() but loads the underlying socket, or returns an error if + * the descriptor does not represent a socket. + * + * We bump the ref count on the returned socket. XXX Also obtain the SX lock in + * the future. + */ +int +fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) +{ + struct file *fp; + int error; + + *spp = NULL; + if (fflagp) + *fflagp = 0; + if ((error = _fget(td, fd, &fp, 0, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_SOCKET) { + error = ENOTSOCK; + } else { + *spp = (struct socket *)fp->f_data; + if (fflagp) + *fflagp = fp->f_flag; + soref(*spp); + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + return(error); +} + +/* + * Drop the reference count on the the socket and XXX release the SX lock in + * the future. The last reference closes the socket. + */ +void +fputsock(struct socket *so) +{ + sorele(so); +} + +/* + * Drop reference on struct file passed in, may call closef if the + * reference hits zero. + * Expects struct file locked, and will unlock it. 
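+ *
+ * When the final reference is dropped, any flock()-style lock still
+ * held on the file (FHASLOCK) is released, fo_close() is called unless
+ * the file never got real file operations (f_ops == &badfileops), and
+ * the struct file itself is released with ffree().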
+ */ +int +fdrop_locked(fp, td) + struct file *fp; + struct thread *td; +{ + struct flock lf; + struct vnode *vp; + int error; + + FILE_LOCK_ASSERT(fp, MA_OWNED); + + if (--fp->f_count > 0) { + FILE_UNLOCK(fp); + return (0); + } + mtx_lock(&Giant); + if (fp->f_count < 0) + panic("fdrop: count < 0"); + if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + FILE_UNLOCK(fp); + (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + } else + FILE_UNLOCK(fp); + if (fp->f_ops != &badfileops) + error = fo_close(fp, td); + else + error = 0; + ffree(fp); + mtx_unlock(&Giant); + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on + * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +flock(td, uap) + struct thread *td; + register struct flock_args *uap; +{ + struct file *fp; + struct vnode *vp; + struct flock lf; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (EOPNOTSUPP); + } + + mtx_lock(&Giant); + vp = (struct vnode *)fp->f_data; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + FILE_LOCK(fp); + fp->f_flag &= ~FHASLOCK; + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + goto done2; + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else { + error = EBADF; + goto done2; + } + FILE_LOCK(fp); + fp->f_flag |= FHASLOCK; + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); +done2: + fdrop(fp, td); + mtx_unlock(&Giant); + return (error); +} + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + */ +/* ARGSUSED */ +static int +fdopen(dev, mode, type, td) + dev_t dev; + int mode, type; + struct thread *td; +{ + + /* + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(td, fdp, indx, dfd, mode, error) + struct thread *td; + struct filedesc *fdp; + int indx, dfd; + int mode; + int error; +{ + register struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, then reject. + */ + FILEDESC_LOCK(fdp); + if ((u_int)dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + + /* + * There are two cases of interest here. 
+ * + * For ENODEV simply dup (dfd) to file descriptor + * (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and + * store it in (indx). (dfd) is effectively closed by + * this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + FILE_LOCK(wfp); + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + FILE_UNLOCK(wfp); + FILEDESC_UNLOCK(fdp); + return (EACCES); + } + fp = fdp->fd_ofiles[indx]; +#if 0 + if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) + (void) munmapfd(td, indx); +#endif + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fhold_locked(wfp); + FILE_UNLOCK(wfp); + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd, and stuff it into indx. + */ + fp = fdp->fd_ofiles[indx]; +#if 0 + if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) + (void) munmapfd(td, indx); +#endif + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + + /* + * Complete the clean up of the filedesc structure by + * recomputing the various hints. + */ + if (indx > fdp->fd_lastfile) { + fdp->fd_lastfile = indx; + } else { + while (fdp->fd_lastfile > 0 && + fdp->fd_ofiles[fdp->fd_lastfile] == NULL) { + fdp->fd_lastfile--; + } + if (dfd < fdp->fd_freefile) + fdp->fd_freefile = dfd; + } + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); + return (0); + + default: + FILEDESC_UNLOCK(fdp); + return (error); + } + /* NOTREACHED */ +} + +/* + * Get file structures. 
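+ *
+ * The handler below exports the filehead pointer followed by one
+ * struct file per open file.  A userland consumer of the kern.file
+ * sysctl would typically size the buffer first; an illustrative
+ * sketch (error handling omitted, not taken from any real program):
+ *
+ *    int mib[2] = { CTL_KERN, KERN_FILE };
+ *    size_t len;
+ *    sysctl(mib, 2, NULL, &len, NULL, 0);    -- returns an overestimate
+ *    buf = malloc(len);
+ *    sysctl(mib, 2, buf, &len, NULL, 0);     -- filehead, then the files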
+ */ +static int +sysctl_kern_file(SYSCTL_HANDLER_ARGS) +{ + int error; + struct file *fp; + + sx_slock(&filelist_lock); + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + error = SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file)); + sx_sunlock(&filelist_lock); + return (error); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) { + sx_sunlock(&filelist_lock); + return (error); + } + + /* + * followed by an array of file structures + */ + LIST_FOREACH(fp, &filehead, f_list) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) { + sx_sunlock(&filelist_lock); + return (error); + } + } + sx_sunlock(&filelist_lock); + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", "Entire file table"); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + &nfiles, 0, "System-wide number of open files"); + +static void +fildesc_drvinit(void *unused) +{ + dev_t dev; + + dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); + if (!devfs_present) { + int fd; + + for (fd = 3; fd < NUMFDESC; fd++) + make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + } +} + +struct fileops badfileops = { + badfo_readwrite, + badfo_readwrite, + badfo_ioctl, + badfo_poll, + badfo_kqfilter, + badfo_stat, + badfo_close +}; + +static int +badfo_readwrite(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + + return (EBADF); +} + +static int +badfo_ioctl(fp, com, data, td) + struct file *fp; + u_long com; + caddr_t data; + struct thread *td; +{ + + return (EBADF); +} + +static int +badfo_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + + return (0); +} + +static int +badfo_kqfilter(fp, kn) + struct file *fp; + struct knote *kn; +{ + + return (0); +} + +static int +badfo_stat(fp, sb, td) + struct file *fp; + struct stat *sb; + struct thread *td; +{ + + return (EBADF); +} + +static int +badfo_close(fp, td) + struct file *fp; + struct thread *td; +{ + + return (EBADF); +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + +static void filelistinit(void *); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) + +/* ARGSUSED*/ +static void +filelistinit(dummy) + void *dummy; +{ + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + + sx_init(&filelist_lock, "filelist lock"); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); +} diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c new file mode 100644 index 0000000..a33b0c7 --- /dev/null +++ b/sys/kern/kern_environment.c @@ -0,0 +1,461 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The unified bootloader passes us a pointer to a preserved copy of + * bootstrap/kernel environment variables. We convert them to a + * dynamic array of strings later when the VM subsystem is up. + * + * We make these available through the kenv(2) syscall for userland + * and through getenv()/freeenv() setenv() unsetenv() testenv() for + * the kernel. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/libkern.h> +#include <sys/kenv.h> + +MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); + +#define KENV_SIZE 512 /* Maximum number of environment strings */ + +/* pointer to the static environment */ +char *kern_envp; +static char *kernenv_next(char *); + +/* dynamic environment variables */ +char **kenvp; +struct sx kenv_lock; + +/* + * No need to protect this with a mutex + * since SYSINITS are single threaded. 
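+ *
+ * dynamic_kenv is flipped to 1 at the end of init_dynamic_kenv()
+ * below; from that point on getenv() and friends use the malloc'ed
+ * kenvp[] array (protected by kenv_lock) instead of the static buffer
+ * handed over by the loader.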
+ */ +int dynamic_kenv = 0; + +#define KENV_CHECK if (!dynamic_kenv) \ + panic("%s: called before SI_SUB_KMEM", __func__) + +int +kenv(td, uap) + struct thread *td; + struct kenv_args /* { + syscallarg(int) what; + syscallarg(const char *) name; + syscallarg(char *) value; + syscallarg(int) len; + } */ *uap; +{ + char *name, *value; + size_t len, done; + int error, i; + + KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0")); + + error = 0; + if (SCARG(uap, what) == KENV_DUMP) { + len = 0; + /* Return the size if called with a NULL buffer */ + if (SCARG(uap, value) == NULL) { + sx_slock(&kenv_lock); + for (i = 0; kenvp[i] != NULL; i++) + len += strlen(kenvp[i]) + 1; + sx_sunlock(&kenv_lock); + td->td_retval[0] = len; + return (0); + } + done = 0; + sx_slock(&kenv_lock); + for (i = 0; kenvp[i] != NULL && done < SCARG(uap, len); i++) { + len = min(strlen(kenvp[i]) + 1, SCARG(uap, len) - done); + error = copyout(kenvp[i], SCARG(uap, value) + done, + len); + if (error) { + sx_sunlock(&kenv_lock); + return (error); + } + done += len; + } + sx_sunlock(&kenv_lock); + return (0); + } + + if ((SCARG(uap, what) == KENV_SET) || + (SCARG(uap, what) == KENV_UNSET)) { + error = suser(td); + if (error) + return (error); + } + + name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK); + + error = copyinstr(SCARG(uap, name), name, KENV_MNAMELEN, NULL); + if (error) + goto done; + + switch (SCARG(uap, what)) { + case KENV_GET: + value = getenv(name); + if (value == NULL) { + error = ENOENT; + goto done; + } + len = strlen(value) + 1; + if (len > SCARG(uap, len)) + len = SCARG(uap, len); + error = copyout(value, SCARG(uap, value), len); + freeenv(value); + if (error) + goto done; + td->td_retval[0] = len; + break; + case KENV_SET: + len = SCARG(uap, len); + if (len < 1) { + error = EINVAL; + goto done; + } + if (len > KENV_MVALLEN) + len = KENV_MVALLEN; + value = malloc(len, M_TEMP, M_WAITOK); + error = copyinstr(SCARG(uap, value), value, len, NULL); + if (error) { + free(value, M_TEMP); + goto done; + } + setenv(name, value); + free(value, M_TEMP); + break; + case KENV_UNSET: + error = unsetenv(name); + if (error) + error = ENOENT; + break; + default: + error = EINVAL; + break; + } +done: + free(name, M_TEMP); + return (error); +} + +/* + * Setup the dynamic kernel environment. + */ +static void +init_dynamic_kenv(void *data __unused) +{ + char *cp; + int len, i; + + kenvp = malloc(KENV_SIZE * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); + i = 0; + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + len = strlen(cp) + 1; + kenvp[i] = malloc(len, M_KENV, M_WAITOK); + strcpy(kenvp[i++], cp); + } + kenvp[i] = NULL; + + sx_init(&kenv_lock, "kernel environment"); + dynamic_kenv = 1; +} +SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL); + +void +freeenv(char *env) +{ + + if (dynamic_kenv) + free(env, M_KENV); +} + +/* + * Internal functions for string lookup. 
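+ *
+ * Both helpers match the name up to the '=' separator: _getenv_static
+ * walks the loader-supplied buffer of consecutive "name=value"
+ * strings, while _getenv_dynamic searches the kenvp[] array and can
+ * also report the matching index for use by setenv() and unsetenv().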
+ */ +static char * +_getenv_dynamic(const char *name, int *idx) +{ + char *cp; + int len, i; + + sx_assert(&kenv_lock, SX_LOCKED); + len = strlen(name); + for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { + if ((cp[len] == '=') && + (strncmp(cp, name, len) == 0)) { + if (idx != NULL) + *idx = i; + return (cp + len + 1); + } + } + return (NULL); +} + +static char * +_getenv_static(const char *name) +{ + char *cp, *ep; + int len; + + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + for (ep = cp; (*ep != '=') && (*ep != 0); ep++) + ; + if (*ep != '=') + continue; + len = ep - cp; + ep++; + if (!strncmp(name, cp, len) && name[len] == 0) + return (ep); + } + return (NULL); +} + +/* + * Look up an environment variable by name. + * Return a pointer to the string if found. + * The pointer has to be freed with freeenv() + * after use. + */ +char * +getenv(const char *name) +{ + char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; + char *ret, *cp; + int len; + + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = _getenv_dynamic(name, NULL); + if (cp != NULL) { + strcpy(buf, cp); + sx_sunlock(&kenv_lock); + len = strlen(buf) + 1; + ret = malloc(len, M_KENV, M_WAITOK); + strcpy(ret, buf); + } else { + sx_sunlock(&kenv_lock); + ret = NULL; + } + } else + ret = _getenv_static(name); + return (ret); +} + +/* + * Test if an environment variable is defined. + */ +int +testenv(const char *name) +{ + char *cp; + + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = _getenv_dynamic(name, NULL); + sx_sunlock(&kenv_lock); + } else + cp = _getenv_static(name); + if (cp != NULL) + return (1); + return (0); +} + +/* + * Set an environment variable by name. + */ +int +setenv(const char *name, const char *value) +{ + char *buf, *cp, *oldenv; + int namelen, vallen, i; + + KENV_CHECK; + + namelen = strlen(name) + 1; + if (namelen > KENV_MNAMELEN) + return (-1); + vallen = strlen(value) + 1; + if (vallen > KENV_MVALLEN) + return (-1); + buf = malloc(namelen + vallen, M_KENV, M_WAITOK); + sprintf(buf, "%s=%s", name, value); + + sx_xlock(&kenv_lock); + cp = _getenv_dynamic(name, &i); + if (cp != NULL) { + oldenv = kenvp[i]; + kenvp[i] = buf; + sx_xunlock(&kenv_lock); + free(oldenv, M_KENV); + } else { + /* We add the option if it wasn't found */ + for (i = 0; (cp = kenvp[i]) != NULL; i++) + ; + kenvp[i] = buf; + kenvp[i + 1] = NULL; + sx_xunlock(&kenv_lock); + } + return (0); +} + +/* + * Unset an environment variable string. + */ +int +unsetenv(const char *name) +{ + char *cp, *oldenv; + int i, j; + + KENV_CHECK; + + sx_xlock(&kenv_lock); + cp = _getenv_dynamic(name, &i); + if (cp != NULL) { + oldenv = kenvp[i]; + for (j = i + 1; kenvp[j] != NULL; j++) + kenvp[i++] = kenvp[j]; + kenvp[i] = NULL; + sx_xunlock(&kenv_lock); + free(oldenv, M_KENV); + return (0); + } + sx_xunlock(&kenv_lock); + return (-1); +} + +/* + * Return a string value from an environment variable. + */ +int +getenv_string(const char *name, char *data, int size) +{ + char *tmp; + + tmp = getenv(name); + if (tmp != NULL) { + strncpy(data, tmp, size); + freeenv(tmp); + data[size - 1] = 0; + return (1); + } else + return (0); +} + +/* + * Return an integer value from an environment variable. + */ +int +getenv_int(const char *name, int *data) +{ + quad_t tmp; + int rval; + + rval = getenv_quad(name, &tmp); + if (rval) + *data = (int) tmp; + return (rval); +} + +/* + * Return a quad_t value from an environment variable. 
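+ *
+ * The string must parse completely (strtoq() has to consume it all),
+ * otherwise 0 is returned and *data is left untouched.  A hypothetical
+ * caller (the variable name is only an example):
+ *
+ *    quad_t maxmem;
+ *    if (getenv_quad("hw.maxmem", &maxmem))
+ *            ... use maxmem ...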
+ */ +int +getenv_quad(const char *name, quad_t *data) +{ + char *value; + char *vtp; + quad_t iv; + + value = getenv(name); + if (value == NULL) + return (0); + iv = strtoq(value, &vtp, 0); + if ((vtp == value) || (*vtp != '\0')) { + freeenv(value); + return (0); + } + freeenv(value); + *data = iv; + return (1); +} + +/* + * Find the next entry after the one which (cp) falls within, return a + * pointer to its start or NULL if there are no more. + */ +static char * +kernenv_next(char *cp) +{ + + if (cp != NULL) { + while (*cp != 0) + cp++; + cp++; + if (*cp == 0) + cp = NULL; + } + return (cp); +} + +void +tunable_int_init(void *data) +{ + struct tunable_int *d = (struct tunable_int *)data; + + TUNABLE_INT_FETCH(d->path, d->var); +} + +void +tunable_quad_init(void *data) +{ + struct tunable_quad *d = (struct tunable_quad *)data; + + TUNABLE_QUAD_FETCH(d->path, d->var); +} + +void +tunable_str_init(void *data) +{ + struct tunable_str *d = (struct tunable_str *)data; + + TUNABLE_STR_FETCH(d->path, d->var, d->size); +} diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c new file mode 100644 index 0000000..46d57c9 --- /dev/null +++ b/sys/kern/kern_event.c @@ -0,0 +1,1082 @@ +/*- + * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/selinfo.h> +#include <sys/queue.h> +#include <sys/event.h> +#include <sys/eventvar.h> +#include <sys/poll.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/uio.h> + +#include <vm/uma.h> + +MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); + +static int kqueue_scan(struct file *fp, int maxevents, + struct kevent *ulistp, const struct timespec *timeout, + struct thread *td); +static int kqueue_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int kqueue_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int kqueue_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int kqueue_kqfilter(struct file *fp, struct knote *kn); +static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td); +static int kqueue_close(struct file *fp, struct thread *td); +static void kqueue_wakeup(struct kqueue *kq); + +static struct fileops kqueueops = { + kqueue_read, + kqueue_write, + kqueue_ioctl, + kqueue_poll, + kqueue_kqfilter, + kqueue_stat, + kqueue_close +}; + +static void knote_attach(struct knote *kn, struct filedesc *fdp); +static void knote_drop(struct knote *kn, struct thread *td); +static void knote_enqueue(struct knote *kn); +static void knote_dequeue(struct knote *kn); +static void knote_init(void); +static struct knote *knote_alloc(void); +static void knote_free(struct knote *kn); + +static void filt_kqdetach(struct knote *kn); +static int filt_kqueue(struct knote *kn, long hint); +static int filt_procattach(struct knote *kn); +static void filt_procdetach(struct knote *kn); +static int filt_proc(struct knote *kn, long hint); +static int filt_fileattach(struct knote *kn); +static void filt_timerexpire(void *knx); +static int filt_timerattach(struct knote *kn); +static void filt_timerdetach(struct knote *kn); +static int filt_timer(struct knote *kn, long hint); + +static struct filterops file_filtops = + { 1, filt_fileattach, NULL, NULL }; +static struct filterops kqread_filtops = + { 1, NULL, filt_kqdetach, filt_kqueue }; +static struct filterops proc_filtops = + { 0, filt_procattach, filt_procdetach, filt_proc }; +static struct filterops timer_filtops = + { 0, filt_timerattach, filt_timerdetach, filt_timer }; + +static uma_zone_t knote_zone; +static int kq_ncallouts = 0; +static int kq_calloutmax = (4 * 1024); +SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, + &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); + +#define KNOTE_ACTIVATE(kn) do { \ + kn->kn_status |= KN_ACTIVE; \ + if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ + knote_enqueue(kn); \ +} while(0) + +#define KN_HASHSIZE 64 /* XXX should be tunable */ +#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) + +static int +filt_nullattach(struct knote *kn) +{ + + return (ENXIO); +}; + +struct filterops null_filtops = + { 0, filt_nullattach, NULL, NULL }; + +extern struct filterops sig_filtops; + +/* + * Table for for all system-defined filters. 
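+ *
+ * System filters are small negative numbers and the table is indexed
+ * by their one's complement, so ~EVFILT_READ == ~(-1) == 0 selects the
+ * first entry and so on down the list.  kqueue_register() and
+ * kqueue_add_filteropts()/kqueue_del_filteropts() below all index the
+ * table as sysfilt_ops[~filt].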
+ */ +static struct filterops *sysfilt_ops[] = { + &file_filtops, /* EVFILT_READ */ + &file_filtops, /* EVFILT_WRITE */ + &null_filtops, /* EVFILT_AIO */ + &file_filtops, /* EVFILT_VNODE */ + &proc_filtops, /* EVFILT_PROC */ + &sig_filtops, /* EVFILT_SIGNAL */ + &timer_filtops, /* EVFILT_TIMER */ + &file_filtops, /* EVFILT_NETDEV */ +}; + +static int +filt_fileattach(struct knote *kn) +{ + + return (fo_kqfilter(kn->kn_fp, kn)); +} + +/*ARGSUSED*/ +static int +kqueue_kqfilter(struct file *fp, struct knote *kn) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + if (kn->kn_filter != EVFILT_READ) + return (1); + + kn->kn_fop = &kqread_filtops; + SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); + return (0); +} + +static void +filt_kqdetach(struct knote *kn) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); +} + +/*ARGSUSED*/ +static int +filt_kqueue(struct knote *kn, long hint) +{ + struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; + + kn->kn_data = kq->kq_count; + return (kn->kn_data > 0); +} + +static int +filt_procattach(struct knote *kn) +{ + struct proc *p; + int error; + + p = pfind(kn->kn_id); + if (p == NULL) + return (ESRCH); + if ((error = p_cansee(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + /* + * internal flag indicating registration done by kernel + */ + if (kn->kn_flags & EV_FLAG1) { + kn->kn_data = kn->kn_sdata; /* ppid */ + kn->kn_fflags = NOTE_CHILD; + kn->kn_flags &= ~EV_FLAG1; + } + + SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); + PROC_UNLOCK(p); + + return (0); +} + +/* + * The knote may be attached to a different process, which may exit, + * leaving nothing for the knote to be attached to. So when the process + * exits, the knote is marked as DETACHED and also flagged as ONESHOT so + * it will be deleted when read out. However, as part of the knote deletion, + * this routine is called, so a check is needed to avoid actually performing + * a detach, because the original process does not exist any more. + */ +static void +filt_procdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + if (kn->kn_status & KN_DETACHED) + return; + + PROC_LOCK(p); + SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); + PROC_UNLOCK(p); +} + +static int +filt_proc(struct knote *kn, long hint) +{ + u_int event; + + /* + * mask off extra data + */ + event = (u_int)hint & NOTE_PCTRLMASK; + + /* + * if the user is interested in this event, record it. + */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* + * process is gone, so flag the event as finished. + */ + if (event == NOTE_EXIT) { + kn->kn_status |= KN_DETACHED; + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + /* + * process forked, and user wants to track the new process, + * so attach a new knote to it, and immediately report an + * event with the parent's pid. + */ + if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { + struct kevent kev; + int error; + + /* + * register knote with new process. 
+ */ + kev.ident = hint & NOTE_PDATAMASK; /* pid */ + kev.filter = kn->kn_filter; + kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; + kev.fflags = kn->kn_sfflags; + kev.data = kn->kn_id; /* parent */ + kev.udata = kn->kn_kevent.udata; /* preserve udata */ + error = kqueue_register(kn->kn_kq, &kev, NULL); + if (error) + kn->kn_fflags |= NOTE_TRACKERR; + } + + return (kn->kn_fflags != 0); +} + +static void +filt_timerexpire(void *knx) +{ + struct knote *kn = knx; + struct callout *calloutp; + struct timeval tv; + int tticks; + + kn->kn_data++; + KNOTE_ACTIVATE(kn); + + if ((kn->kn_flags & EV_ONESHOT) == 0) { + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + calloutp = (struct callout *)kn->kn_hook; + callout_reset(calloutp, tticks, filt_timerexpire, kn); + } +} + +/* + * data contains amount of time to sleep, in milliseconds + */ +static int +filt_timerattach(struct knote *kn) +{ + struct callout *calloutp; + struct timeval tv; + int tticks; + + if (kq_ncallouts >= kq_calloutmax) + return (ENOMEM); + kq_ncallouts++; + + tv.tv_sec = kn->kn_sdata / 1000; + tv.tv_usec = (kn->kn_sdata % 1000) * 1000; + tticks = tvtohz(&tv); + + kn->kn_flags |= EV_CLEAR; /* automatically set */ + MALLOC(calloutp, struct callout *, sizeof(*calloutp), + M_KQUEUE, M_WAITOK); + callout_init(calloutp, 0); + callout_reset(calloutp, tticks, filt_timerexpire, kn); + kn->kn_hook = calloutp; + + return (0); +} + +static void +filt_timerdetach(struct knote *kn) +{ + struct callout *calloutp; + + calloutp = (struct callout *)kn->kn_hook; + callout_stop(calloutp); + FREE(calloutp, M_KQUEUE); + kq_ncallouts--; +} + +static int +filt_timer(struct knote *kn, long hint) +{ + + return (kn->kn_data != 0); +} + +/* + * MPSAFE + */ +int +kqueue(struct thread *td, struct kqueue_args *uap) +{ + struct filedesc *fdp; + struct kqueue *kq; + struct file *fp; + int fd, error; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + error = falloc(td, &fp, &fd); + if (error) + goto done2; + kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + TAILQ_INIT(&kq->kq_head); + FILE_LOCK(fp); + fp->f_flag = FREAD | FWRITE; + fp->f_type = DTYPE_KQUEUE; + fp->f_ops = &kqueueops; + TAILQ_INIT(&kq->kq_head); + fp->f_data = kq; + FILE_UNLOCK(fp); + FILEDESC_LOCK(fdp); + td->td_retval[0] = fd; + if (fdp->fd_knlistsize < 0) + fdp->fd_knlistsize = 0; /* this process has a kq */ + FILEDESC_UNLOCK(fdp); + kq->kq_fdp = fdp; +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kevent_args { + int fd; + const struct kevent *changelist; + int nchanges; + struct kevent *eventlist; + int nevents; + const struct timespec *timeout; +}; +#endif +/* + * MPSAFE + */ +int +kevent(struct thread *td, struct kevent_args *uap) +{ + struct kevent *kevp; + struct kqueue *kq; + struct file *fp; + struct timespec ts; + int i, n, nerrors, error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_KQUEUE) { + fdrop(fp, td); + return (EBADF); + } + if (uap->timeout != NULL) { + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + goto done_nogiant; + uap->timeout = &ts; + } + mtx_lock(&Giant); + + kq = (struct kqueue *)fp->f_data; + nerrors = 0; + + while (uap->nchanges > 0) { + n = uap->nchanges > KQ_NEVENTS ? 
KQ_NEVENTS : uap->nchanges; + error = copyin(uap->changelist, kq->kq_kev, + n * sizeof(struct kevent)); + if (error) + goto done; + for (i = 0; i < n; i++) { + kevp = &kq->kq_kev[i]; + kevp->flags &= ~EV_SYSFLAGS; + error = kqueue_register(kq, kevp, td); + if (error) { + if (uap->nevents != 0) { + kevp->flags = EV_ERROR; + kevp->data = error; + (void) copyout(kevp, + uap->eventlist, + sizeof(*kevp)); + uap->eventlist++; + uap->nevents--; + nerrors++; + } else { + goto done; + } + } + } + uap->nchanges -= n; + uap->changelist += n; + } + if (nerrors) { + td->td_retval[0] = nerrors; + error = 0; + goto done; + } + + error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); +done: + mtx_unlock(&Giant); +done_nogiant: + if (fp != NULL) + fdrop(fp, td); + return (error); +} + +int +kqueue_add_filteropts(int filt, struct filterops *filtops) +{ + + if (filt > 0) + panic("filt(%d) > 0", filt); + if (filt + EVFILT_SYSCOUNT < 0) + panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", + filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); + if (sysfilt_ops[~filt] != &null_filtops) + panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); + sysfilt_ops[~filt] = filtops; + return (0); +} + +int +kqueue_del_filteropts(int filt) +{ + + if (filt > 0) + panic("filt(%d) > 0", filt); + if (filt + EVFILT_SYSCOUNT < 0) + panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", + filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); + if (sysfilt_ops[~filt] == &null_filtops) + panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); + sysfilt_ops[~filt] = &null_filtops; + return (0); +} + +int +kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td) +{ + struct filedesc *fdp = kq->kq_fdp; + struct filterops *fops; + struct file *fp = NULL; + struct knote *kn = NULL; + int s, error = 0; + + if (kev->filter < 0) { + if (kev->filter + EVFILT_SYSCOUNT < 0) + return (EINVAL); + fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ + } else { + /* + * XXX + * filter attach routine is responsible for insuring that + * the identifier can be attached to it. + */ + printf("unknown filter: %d\n", kev->filter); + return (EINVAL); + } + + FILEDESC_LOCK(fdp); + if (fops->f_isfd) { + /* validate descriptor */ + if ((u_int)kev->ident >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[kev->ident]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + fhold(fp); + + if (kev->ident < fdp->fd_knlistsize) { + SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) + if (kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } else { + if (fdp->fd_knhashmask != 0) { + struct klist *list; + + list = &fdp->fd_knhash[ + KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; + SLIST_FOREACH(kn, list, kn_link) + if (kev->ident == kn->kn_id && + kq == kn->kn_kq && + kev->filter == kn->kn_filter) + break; + } + } + FILEDESC_UNLOCK(fdp); + + if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { + error = ENOENT; + goto done; + } + + /* + * kn now contains the matching knote, or NULL if no match + */ + if (kev->flags & EV_ADD) { + + if (kn == NULL) { + kn = knote_alloc(); + if (kn == NULL) { + error = ENOMEM; + goto done; + } + kn->kn_fp = fp; + kn->kn_kq = kq; + kn->kn_fop = fops; + + /* + * apply reference count to knote structure, and + * do not release it at the end of this routine. 
+ */ + fp = NULL; + + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kev->fflags = 0; + kev->data = 0; + kn->kn_kevent = *kev; + + knote_attach(kn, fdp); + if ((error = fops->f_attach(kn)) != 0) { + knote_drop(kn, td); + goto done; + } + } else { + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. + */ + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kn->kn_kevent.udata = kev->udata; + } + + s = splhigh(); + if (kn->kn_fop->f_event(kn, 0)) + KNOTE_ACTIVATE(kn); + splx(s); + + } else if (kev->flags & EV_DELETE) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + goto done; + } + + if ((kev->flags & EV_DISABLE) && + ((kn->kn_status & KN_DISABLED) == 0)) { + s = splhigh(); + kn->kn_status |= KN_DISABLED; + splx(s); + } + + if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { + s = splhigh(); + kn->kn_status &= ~KN_DISABLED; + if ((kn->kn_status & KN_ACTIVE) && + ((kn->kn_status & KN_QUEUED) == 0)) + knote_enqueue(kn); + splx(s); + } + +done: + if (fp != NULL) + fdrop(fp, td); + return (error); +} + +static int +kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, + const struct timespec *tsp, struct thread *td) +{ + struct kqueue *kq; + struct kevent *kevp; + struct timeval atv, rtv, ttv; + struct knote *kn, marker; + int s, count, timeout, nkev = 0, error = 0; + + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); + + kq = (struct kqueue *)fp->f_data; + count = maxevents; + if (count == 0) + goto done; + + if (tsp != NULL) { + TIMESPEC_TO_TIMEVAL(&atv, tsp); + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) + timeout = -1; + else + timeout = atv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&atv); + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + timeout = 0; + } + goto start; + +retry: + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timeout = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + +start: + kevp = kq->kq_kev; + s = splhigh(); + if (kq->kq_count == 0) { + if (timeout < 0) { + error = EWOULDBLOCK; + } else { + kq->kq_state |= KQ_SLEEP; + error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); + } + splx(s); + if (error == 0) + goto retry; + /* don't restart after signals... 
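ERESTART is turned into EINTR, and EWOULDBLOCK (the poll case) is reported as success with no events.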
*/ + if (error == ERESTART) + error = EINTR; + else if (error == EWOULDBLOCK) + error = 0; + goto done; + } + + TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); + while (count) { + kn = TAILQ_FIRST(&kq->kq_head); + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + if (kn == &marker) { + splx(s); + if (count == maxevents) + goto retry; + goto done; + } + if (kn->kn_status & KN_DISABLED) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + continue; + } + if ((kn->kn_flags & EV_ONESHOT) == 0 && + kn->kn_fop->f_event(kn, 0) == 0) { + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + continue; + } + *kevp = kn->kn_kevent; + kevp++; + nkev++; + if (kn->kn_flags & EV_ONESHOT) { + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + s = splhigh(); + } else if (kn->kn_flags & EV_CLEAR) { + kn->kn_data = 0; + kn->kn_fflags = 0; + kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); + kq->kq_count--; + } else { + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + } + count--; + if (nkev == KQ_NEVENTS) { + splx(s); + error = copyout(&kq->kq_kev, ulistp, + sizeof(struct kevent) * nkev); + ulistp += nkev; + nkev = 0; + kevp = kq->kq_kev; + s = splhigh(); + if (error) + break; + } + } + TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); + splx(s); +done: + if (nkev != 0) + error = copyout(&kq->kq_kev, ulistp, + sizeof(struct kevent) * nkev); + td->td_retval[0] = maxevents - count; + return (error); +} + +/* + * XXX + * This could be expanded to call kqueue_scan, if desired. + */ +/*ARGSUSED*/ +static int +kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct thread *td) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, + int flags, struct thread *td) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +static int +kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct kqueue *kq; + int revents = 0; + int s = splnet(); + + kq = (struct kqueue *)fp->f_data; + if (events & (POLLIN | POLLRDNORM)) { + if (kq->kq_count) { + revents |= events & (POLLIN | POLLRDNORM); + } else { + selrecord(td, &kq->kq_sel); + kq->kq_state |= KQ_SEL; + } + } + splx(s); + return (revents); +} + +/*ARGSUSED*/ +static int +kqueue_stat(struct file *fp, struct stat *st, struct thread *td) +{ + struct kqueue *kq; + + kq = (struct kqueue *)fp->f_data; + bzero((void *)st, sizeof(*st)); + st->st_size = kq->kq_count; + st->st_blksize = sizeof(struct kevent); + st->st_mode = S_IFIFO; + return (0); +} + +/*ARGSUSED*/ +static int +kqueue_close(struct file *fp, struct thread *td) +{ + struct kqueue *kq = (struct kqueue *)fp->f_data; + struct filedesc *fdp = td->td_proc->p_fd; + struct knote **knp, *kn, *kn0; + int i; + + FILEDESC_LOCK(fdp); + for (i = 0; i < fdp->fd_knlistsize; i++) { + knp = &SLIST_FIRST(&fdp->fd_knlist[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == kn->kn_kq) { + kn->kn_fop->f_detach(kn); + *knp = kn0; + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(kn->kn_fp, td); + knote_free(kn); + FILEDESC_LOCK(fdp); + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + if (fdp->fd_knhashmask != 0) { + for (i = 0; i < fdp->fd_knhashmask + 1; i++) { + knp = &SLIST_FIRST(&fdp->fd_knhash[i]); + kn = *knp; + while (kn != NULL) { + kn0 = SLIST_NEXT(kn, kn_link); + if (kq == 
kn->kn_kq) { + kn->kn_fop->f_detach(kn); + *knp = kn0; + /* XXX non-fd release of kn->kn_ptr */ + FILEDESC_UNLOCK(fdp); + knote_free(kn); + FILEDESC_LOCK(fdp); + } else { + knp = &SLIST_NEXT(kn, kn_link); + } + kn = kn0; + } + } + } + FILEDESC_UNLOCK(fdp); + free(kq, M_KQUEUE); + fp->f_data = NULL; + + return (0); +} + +static void +kqueue_wakeup(struct kqueue *kq) +{ + + if (kq->kq_state & KQ_SLEEP) { + kq->kq_state &= ~KQ_SLEEP; + wakeup(kq); + } + if (kq->kq_state & KQ_SEL) { + kq->kq_state &= ~KQ_SEL; + selwakeup(&kq->kq_sel); + } + KNOTE(&kq->kq_sel.si_note, 0); +} + +/* + * walk down a list of knotes, activating them if their event has triggered. + */ +void +knote(struct klist *list, long hint) +{ + struct knote *kn; + + SLIST_FOREACH(kn, list, kn_selnext) + if (kn->kn_fop->f_event(kn, hint)) + KNOTE_ACTIVATE(kn); +} + +/* + * remove all knotes from a specified klist + */ +void +knote_remove(struct thread *td, struct klist *list) +{ + struct knote *kn; + + while ((kn = SLIST_FIRST(list)) != NULL) { + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + } +} + +/* + * remove all knotes referencing a specified fd + */ +void +knote_fdclose(struct thread *td, int fd) +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct klist *list; + + FILEDESC_LOCK(fdp); + list = &fdp->fd_knlist[fd]; + FILEDESC_UNLOCK(fdp); + knote_remove(td, list); +} + +static void +knote_attach(struct knote *kn, struct filedesc *fdp) +{ + struct klist *list, *oldlist; + int size, newsize; + + FILEDESC_LOCK(fdp); + + if (! kn->kn_fop->f_isfd) { + if (fdp->fd_knhashmask == 0) + fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, + &fdp->fd_knhashmask); + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + goto done; + } + + if (fdp->fd_knlistsize <= kn->kn_id) { +retry: + size = fdp->fd_knlistsize; + while (size <= kn->kn_id) + size += KQEXTENT; + FILEDESC_UNLOCK(fdp); + MALLOC(list, struct klist *, + size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + FILEDESC_LOCK(fdp); + newsize = fdp->fd_knlistsize; + while (newsize <= kn->kn_id) + newsize += KQEXTENT; + if (newsize != size) { + FILEDESC_UNLOCK(fdp); + free(list, M_TEMP); + FILEDESC_LOCK(fdp); + goto retry; + } + bcopy(fdp->fd_knlist, list, + fdp->fd_knlistsize * sizeof(struct klist *)); + bzero((caddr_t)list + + fdp->fd_knlistsize * sizeof(struct klist *), + (size - fdp->fd_knlistsize) * sizeof(struct klist *)); + if (fdp->fd_knlist != NULL) + oldlist = fdp->fd_knlist; + else + oldlist = NULL; + fdp->fd_knlistsize = size; + fdp->fd_knlist = list; + FILEDESC_UNLOCK(fdp); + if (oldlist != NULL) + FREE(oldlist, M_KQUEUE); + FILEDESC_LOCK(fdp); + } + list = &fdp->fd_knlist[kn->kn_id]; +done: + FILEDESC_UNLOCK(fdp); + SLIST_INSERT_HEAD(list, kn, kn_link); + kn->kn_status = 0; +} + +/* + * should be called at spl == 0, since we don't want to hold spl + * while calling fdrop and free. 
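+ * knote_dequeue() raises spl itself around the queue manipulation.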
+ */ +static void +knote_drop(struct knote *kn, struct thread *td) +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct klist *list; + + FILEDESC_LOCK(fdp); + if (kn->kn_fop->f_isfd) + list = &fdp->fd_knlist[kn->kn_id]; + else + list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + if (kn->kn_fop->f_isfd) + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + + SLIST_REMOVE(list, kn, knote, kn_link); + if (kn->kn_status & KN_QUEUED) + knote_dequeue(kn); + if (kn->kn_fop->f_isfd) + fdrop_locked(kn->kn_fp, td); + knote_free(kn); +} + + +static void +knote_enqueue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); + + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); + kn->kn_status |= KN_QUEUED; + kq->kq_count++; + splx(s); + kqueue_wakeup(kq); +} + +static void +knote_dequeue(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + int s = splhigh(); + + KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); + + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + kn->kn_status &= ~KN_QUEUED; + kq->kq_count--; + splx(s); +} + +static void +knote_init(void) +{ + knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + +} +SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) + +static struct knote * +knote_alloc(void) +{ + return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK)); +} + +static void +knote_free(struct knote *kn) +{ + uma_zfree(knote_zone, kn); +} diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c new file mode 100644 index 0000000..bc773df --- /dev/null +++ b/sys/kern/kern_exec.c @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/namei.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+
+#include <machine/reg.h>
+
+MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+
+static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");
+
+/*
+ * callout list for things to do at exec time
+ */
+struct execlist {
+ execlist_fn function;
+ TAILQ_ENTRY(execlist) next;
+};
+
+TAILQ_HEAD(exec_list_head, execlist);
+static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
+
+static register_t *exec_copyout_strings(struct image_params *);
+
+/* XXX This should be vm_size_t. */
+static u_long ps_strings = PS_STRINGS;
+SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
+
+/* XXX This should be vm_size_t. */
+static u_long usrstack = USRSTACK;
+SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
+
+u_long ps_arg_cache_limit = PAGE_SIZE / 16;
+SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
+ &ps_arg_cache_limit, 0, "");
+
+int ps_argsopen = 1;
+SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
+
+#ifdef __ia64__
+/* XXX HACK */
+static int regstkpages = 256;
+SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
+#endif
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+/*
+ * execve() system call.
+ *
+ * MPSAFE
+ */
+int
+execve(td, uap)
+ struct thread *td;
+ register struct execve_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct nameidata nd, *ndp;
+ struct ucred *newcred = NULL, *oldcred;
+ struct uidinfo *euip;
+ register_t *stack_base;
+ int error, len, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+ int (*img_first)(struct image_params *);
+ struct pargs *oldargs = NULL, *newargs = NULL;
+ struct procsig *oldprocsig, *newprocsig;
+#ifdef KTRACE
+ struct vnode *tracevp = NULL;
+#endif
+ struct vnode *textvp = NULL;
+
+ imgp = &image_params;
+
+ /*
+ * Lock the process and set the P_INEXEC flag to indicate that
+ * it should be left alone until we're done here. This is
+ * necessary to avoid race conditions - e.g. in ptrace() -
+ * that might allow a local user to illicitly obtain elevated
+ * privileges.
+ */
+ mtx_lock(&Giant);
+ PROC_LOCK(p);
+ KASSERT((p->p_flag & P_INEXEC) == 0,
+ ("%s(): process already has P_INEXEC flag", __func__));
+ p->p_flag |= P_INEXEC;
+ PROC_UNLOCK(p);
+
+/* XXXKSE */
+/* !!!!!!!! we need to abort all the other threads of this process before we */
+/* proceed beyond this point! 
*/ + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->argc = imgp->envc = 0; + imgp->argv0 = NULL; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + imgp->vp = NULL; + imgp->firstpage = NULL; + imgp->ps_strings = 0; + imgp->auxarg_size = 0; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + imgp->image_header = imgp->stringbase + ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, td); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + imgp->fname = uap->fname; + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(imgp->vp, 0, td); + goto exec_fail_dealloc; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(imgp->vp, 0, td); + if (error) + goto exec_fail_dealloc; + + /* + * If the current process has a special image activator it + * wants to try first, call it. For example, emulating shell + * scripts differently. + */ + error = -1; + if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) + error = img_first(imgp); + + /* + * Loop through the list of image activators, calling each one. + * An activator returns -1 if there is no match, 0 on success, + * and an error otherwise. + */ + for (i = 0; error == -1 && execsw[i]; ++i) { + if (execsw[i]->ex_imgact == NULL || + execsw[i]->ex_imgact == img_first) { + continue; + } + error = (*execsw[i]->ex_imgact)(imgp); + } + + if (error) { + if (error == -1) + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Special interpreter operation, cleanup and loop up to try to + * activate the interpreter. + */ + if (imgp->interpreted) { + exec_unmap_first_page(imgp); + /* free name buffer and old vnode */ + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(ndp->ni_vp); + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, td); + goto interpret; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. + * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* + * For security and other reasons, the file descriptor table cannot + * be shared after an exec. + */ + FILEDESC_LOCK(p->p_fd); + if (p->p_fd->fd_refcnt > 1) { + struct filedesc *tmp; + + tmp = fdcopy(td); + FILEDESC_UNLOCK(p->p_fd); + fdfree(td); + p->p_fd = tmp; + } else + FILEDESC_UNLOCK(p->p_fd); + + /* + * Malloc things before we need locks. 
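+ * crget(), uifind() and pargs_alloc() may all sleep, so do them
+ * before the process lock is taken again below.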
+ */ + newcred = crget(); + euip = uifind(attr.va_uid); + i = imgp->endargs - imgp->stringbase; + if (ps_arg_cache_limit >= i + sizeof(struct pargs)) + newargs = pargs_alloc(i); + + /* close files on exec */ + fdcloseexec(td); + + /* + * For security and other reasons, signal handlers cannot + * be shared after an exec. The new process gets a copy of the old + * handlers. In execsigs(), the new process will have its signals + * reset. + */ + PROC_LOCK(p); + mp_fixme("procsig needs a lock"); + if (p->p_procsig->ps_refcnt > 1) { + oldprocsig = p->p_procsig; + PROC_UNLOCK(p); + MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + bcopy(oldprocsig, newprocsig, sizeof(*newprocsig)); + newprocsig->ps_refcnt = 1; + oldprocsig->ps_refcnt--; + PROC_LOCK(p); + p->p_procsig = newprocsig; + if (p->p_sigacts == &p->p_uarea->u_sigacts) + panic("shared procsig but private sigacts?"); + + p->p_uarea->u_sigacts = *p->p_sigacts; + p->p_sigacts = &p->p_uarea->u_sigacts; + } + /* Stop profiling */ + stopprofclock(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has its own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. + * + * Don't honor setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + oldcred = p->p_ucred; + if ((((attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid) || + ((attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid)) && + (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. Record any set-id flags first to make sure that + * we do not regain any tracing during a possible block. + */ + setsugid(p); +#ifdef KTRACE + if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) { + mtx_lock(&ktrace_mtx); + p->p_traceflag = 0; + tracevp = p->p_tracep; + p->p_tracep = NULL; + mtx_unlock(&ktrace_mtx); + } +#endif + /* Make sure file descriptors 0..2 are in use. */ + error = fdcheckstd(td); + if (error != 0) { + oldcred = NULL; + goto done1; + } + /* + * Set the new credentials. + */ + crcopy(newcred, oldcred); + if (attr.va_mode & VSUID) + change_euid(newcred, euip); + if (attr.va_mode & VSGID) + change_egid(newcred, attr.va_gid); + setugidsafety(td); + /* + * Implement correct POSIX saved-id behavior. + */ + change_svuid(newcred, newcred->cr_uid); + change_svgid(newcred, newcred->cr_gid); + p->p_ucred = newcred; + newcred = NULL; + } else { + if (oldcred->cr_uid == oldcred->cr_ruid && + oldcred->cr_gid == oldcred->cr_rgid) + p->p_flag &= ~P_SUGID; + /* + * Implement correct POSIX saved-id behavior. + * + * XXX: It's not clear that the existing behavior is + * POSIX-compliant. A number of sources indicate that the + * saved uid/gid should only be updated if the new ruid is + * not equal to the old ruid, or the new euid is not equal + * to the old euid and the new euid is not equal to the old + * ruid. The FreeBSD code always updates the saved uid/gid. + * Also, this code uses the new (replaced) euid and egid as + * the source, which may or may not be the right ones to use. 
+ */ + if (oldcred->cr_svuid != oldcred->cr_uid || + oldcred->cr_svgid != oldcred->cr_gid) { + crcopy(newcred, oldcred); + change_svuid(newcred, newcred->cr_uid); + change_svgid(newcred, newcred->cr_gid); + p->p_ucred = newcred; + newcred = NULL; + } + } + + /* + * Store the vp for use in procfs + */ + textvp = p->p_textvp; + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * Notify others that we exec'd, and clear the P_INEXEC flag + * as we're now a bona fide freshly-execed process. + */ + KNOTE(&p->p_klist, NOTE_EXEC); + p->p_flag &= ~P_INEXEC; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. + */ + _STOPEVENT(p, S_EXEC, 0); + + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Free any previous argument cache */ + oldargs = p->p_args; + p->p_args = NULL; + + /* Set values passed into the program in registers. */ + setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, + imgp->ps_strings); + + /* Cache arguments if they fit inside our allowance */ + if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { + bcopy(imgp->stringbase, newargs->ar_args, i); + p->p_args = newargs; + newargs = NULL; + } +done1: + PROC_UNLOCK(p); + + /* + * Free any resources malloc'd earlier that we didn't use. + */ + uifree(euip); + if (newcred == NULL) + crfree(oldcred); + else + crfree(newcred); + /* + * Handle deferred decrement of ref counts. + */ + if (textvp != NULL) + vrele(textvp); +#ifdef KTRACE + if (tracevp != NULL) + vrele(tracevp); +#endif + if (oldargs != NULL) + pargs_drop(oldargs); + if (newargs != NULL) + pargs_drop(newargs); + +exec_fail_dealloc: + + /* + * free various allocated resources + */ + if (imgp->firstpage) + exec_unmap_first_page(imgp); + + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + + if (imgp->vp) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(imgp->vp); + } + + if (error == 0) + goto done2; + +exec_fail: + /* we're done here, clear P_INEXEC */ + PROC_LOCK(p); + p->p_flag &= ~P_INEXEC; + PROC_UNLOCK(p); + + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. 
exit gracefully */ + exit1(td, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + error = 0; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +int +exec_map_first_page(imgp) + struct image_params *imgp; +{ + int rv, i; + int initial_pagein; + vm_page_t ma[VM_INITIAL_PAGEIN]; + vm_object_t object; + + GIANT_REQUIRED; + + if (imgp->firstpage) { + exec_unmap_first_page(imgp); + } + + VOP_GETVOBJECT(imgp->vp, &object); + + ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + + if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + initial_pagein = VM_INITIAL_PAGEIN; + if (initial_pagein > object->size) + initial_pagein = object->size; + for (i = 1; i < initial_pagein; i++) { + if ((ma[i] = vm_page_lookup(object, i)) != NULL) { + if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) + break; + if (ma[i]->valid) + break; + vm_page_busy(ma[i]); + } else { + ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); + if (ma[i] == NULL) + break; + } + } + initial_pagein = i; + + rv = vm_pager_get_pages(object, ma, initial_pagein, 0); + ma[0] = vm_page_lookup(object, 0); + + if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { + if (ma[0]) { + vm_page_protect(ma[0], VM_PROT_NONE); + vm_page_free(ma[0]); + } + return EIO; + } + } + + vm_page_wire(ma[0]); + vm_page_wakeup(ma[0]); + + pmap_qenter((vm_offset_t)imgp->image_header, ma, 1); + imgp->firstpage = ma[0]; + + return 0; +} + +void +exec_unmap_first_page(imgp) + struct image_params *imgp; +{ + GIANT_REQUIRED; + + if (imgp->firstpage) { + pmap_qremove((vm_offset_t)imgp->image_header, 1); + vm_page_unwire(imgp->firstpage, 1); + imgp->firstpage = NULL; + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. + */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct execlist *ep; + struct proc *p = imgp->proc; + struct vmspace *vmspace = p->p_vmspace; + vm_offset_t stack_addr = USRSTACK - maxssiz; + + GIANT_REQUIRED; + + imgp->vmspace_destroyed = 1; + + /* + * Perform functions registered with at_exec(). + */ + TAILQ_FOREACH(ep, &exec_list, next) + (*ep->function)(p); + + /* + * Blow away entire process VM, if address space not shared, + * otherwise, create a new VM space so that other threads are + * not disrupted + */ + if (vmspace->vm_refcnt == 1) { + if (vmspace->vm_shm) + shmexit(p); + pmap_remove_pages(vmspace_pmap(vmspace), 0, VM_MAXUSER_ADDRESS); + vm_map_remove(&vmspace->vm_map, 0, VM_MAXUSER_ADDRESS); + } else { + vmspace_exec(p); + vmspace = p->p_vmspace; + } + + /* Allocate a new stack */ + error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + +#ifdef __ia64__ + { + /* + * Allocate backing store. We really need something + * similar to vm_map_stack which can allow the backing + * store to grow upwards. This will do for now. + */ + vm_offset_t bsaddr; + bsaddr = USRSTACK - 2*maxssiz; + error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr, + regstkpages * PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr; + } +#endif + + /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the + * VM_STACK case, but they are still used to monitor the size of the + * process stack so we can check the stack rlimit. 
+ */ + vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; + vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz; + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. + */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error; + size_t length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + argp = (caddr_t) (intptr_t) fuword(argv); + if (argp == (caddr_t) -1) + return (EFAULT); + if (argp) + argv++; + if (imgp->argv0) + argp = imgp->argv0; + if (argp) { + do { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } while ((argp = (caddr_t) (intptr_t) fuword(argv++))); + } + } + + imgp->endargs = imgp->stringp; + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +register_t * +exec_copyout_strings(imgp) + struct image_params *imgp; +{ + int argc, envc; + char **vectp; + char *stringp, *destp; + register_t *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = (struct ps_strings *)PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) { + /* + * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for + * lower compatibility. + */ + imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size + : (AT_COUNT * 2); + /* + * The '+ 2' is for the null pointers at the end of each of + * the arg and env vector sets,and imgp->auxarg_size is room + * for argument of Runtime loader. + */ + vectp = (char **) (destp - (imgp->argc + imgp->envc + 2 + + imgp->auxarg_size) * sizeof(char *)); + + } else + /* + * The '+ 2' is for the null pointers at the end of each of + * the arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char *)); + + /* + * vectp also becomes our initial stack base + */ + stack_base = (register_t *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. 
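+ * ps_strings records the addresses and counts of the argv and env
+ * vectors written onto the new user stack below.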
+ */ + suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. + */ + for (; argc > 0; --argc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer separates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. + */ + for (; envc > 0; --envc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Called with imgp->vp locked. + * Return 0 for success or error code on failure. + */ +int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + struct thread *td; + int error; + + td = curthread; /* XXXKSE */ + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, td->td_ucred, td); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) + return (EACCES); + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + */ + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + return (error); + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) + return (ETXTBSY); + + /* + * Call filesystem specific open routine (which does nothing in the + * general case). 
+ */ + error = VOP_OPEN(vp, FREAD, td->td_ucred, td); + return (error); +} + +/* + * Exec handler registration + */ +int +exec_register(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 2; /* New slot and trailing NULL */ + + if (execsw) + for (es = execsw; *es; es++) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + if (execsw) + for (es = execsw; *es; es++) + *xs++ = *es; + *xs++ = execsw_arg; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +exec_unregister(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 1; + + if (execsw == NULL) + panic("unregister with no handlers left?\n"); + + for (es = execsw; *es; es++) { + if (*es == execsw_arg) + break; + } + if (*es == NULL) + return ENOENT; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + *xs++ = *es; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +at_exec(function) + execlist_fn function; +{ + struct execlist *ep; + +#ifdef INVARIANTS + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exec(function)) + printf("WARNING: exec callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + TAILQ_INSERT_TAIL(&exec_list, ep, next); + return (0); +} + +/* + * Scan the exec callout list for the given item and remove it. + * Returns the number of items removed (0 or 1) + */ +int +rm_at_exec(function) + execlist_fn function; +{ + struct execlist *ep; + + TAILQ_FOREACH(ep, &exec_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&exec_list, ep, next); + free(ep, M_ATEXEC); + return(1); + } + } + return (0); +} + diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c new file mode 100644 index 0000000..fab9437 --- /dev/null +++ b/sys/kern/kern_exit.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/tty.h> +#include <sys/wait.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <sys/jail.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <sys/user.h> + +/* Required to be non-static for SysVR4 emulator */ +MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); + +static MALLOC_DEFINE(M_ATEXIT, "atexit", "atexit callback"); + +static int wait1(struct thread *, struct wait_args *, int); + +/* + * callout list for things to do at exit time + */ +struct exitlist { + exitlist_fn function; + TAILQ_ENTRY(exitlist) next; +}; + +TAILQ_HEAD(exit_list_head, exitlist); +static struct exit_list_head exit_list = TAILQ_HEAD_INITIALIZER(exit_list); + +/* + * exit -- + * Death of process. + * + * MPSAFE + */ +void +sys_exit(td, uap) + struct thread *td; + struct sys_exit_args /* { + int rval; + } */ *uap; +{ + + mtx_lock(&Giant); + exit1(td, W_EXITCODE(uap->rval, 0)); + /* NOTREACHED */ +} + +/* + * Exit: deallocate address space and other resources, change proc state + * to zombie, and unlink proc from allproc and parent's lists. Save exit + * status and rusage for wait(). Check for child processes and orphan them. + */ +void +exit1(td, rv) + register struct thread *td; + int rv; +{ + struct exitlist *ep; + struct proc *p, *nq, *q; + struct tty *tp; + struct vnode *ttyvp; + register struct vmspace *vm; + struct vnode *vtmp; +#ifdef KTRACE + struct vnode *tracevp; +#endif + + GIANT_REQUIRED; + + p = td->td_proc; + if (p == initproc) { + printf("init died (signal %d, exit %d)\n", + WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } + + /* + * XXXXKSE: MUST abort all other threads before proceeding past here. + */ + + /* Are we a task leader? 
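If so, send SIGKILL to every peer and wait for them all to exit.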
*/ + PROC_LOCK(p); + if (p == p->p_leader) { + q = p->p_peers; + while (q != NULL) { + PROC_LOCK(q); + psignal(q, SIGKILL); + PROC_UNLOCK(q); + q = q->p_peers; + } + while (p->p_peers) + msleep((caddr_t)p, &p->p_mtx, PWAIT, "exit1", 0); + } + PROC_UNLOCK(p); + +#ifdef PGINPROF + vmsizmon(); +#endif + STOPEVENT(p, S_EXIT, rv); + wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ + + /* + * Check if any loadable modules need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + TAILQ_FOREACH(ep, &exit_list, next) + (*ep->function)(p); + + stopprofclock(p); + + MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), + M_ZOMBIE, M_WAITOK); + /* + * If parent is waiting for us to exit or exec, + * P_PPWAIT is set; we will wakeup the parent below. + */ + PROC_LOCK(p); + p->p_flag &= ~(P_TRACED | P_PPWAIT); + p->p_flag |= P_WEXIT; + SIGEMPTYSET(p->p_siglist); + PROC_UNLOCK(p); + if (timevalisset(&p->p_realtimer.it_value)) + callout_stop(&p->p_itcallout); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pid. + */ + funsetownlst(&p->p_sigiolst); + + /* + * Close open files and release open-file table. + * This may block! + */ + fdfree(td); /* XXXKSE *//* may not be the one in proc */ + + /* + * Remove ourself from our leader's peer list and wake our leader. + */ + PROC_LOCK(p->p_leader); + if (p->p_leader->p_peers) { + q = p->p_leader; + while (q->p_peers != p) + q = q->p_peers; + q->p_peers = p->p_peers; + wakeup((caddr_t)p->p_leader); + } + PROC_UNLOCK(p->p_leader); + + /* The next two chunks should probably be moved to vmspace_exit. */ + vm = p->p_vmspace; + /* + * Release user portion of address space. + * This releases references to vnodes, + * which could cause I/O if the file has been unlinked. + * Need to do this early enough that we can still sleep. + * Can't free the entire vmspace as the kernel stack + * may be mapped within that space also. + */ + if (--vm->vm_refcnt == 0) { + if (vm->vm_shm) + shmexit(p); + pmap_remove_pages(vmspace_pmap(vm), VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + vm->vm_freer = p; + } + + sx_xlock(&proctree_lock); + if (SESS_LEADER(p)) { + register struct session *sp; + + sp = p->p_session; + if (sp->s_ttyvp) { + /* + * Controlling process. + * Signal foreground pgrp, + * drain controlling terminal + * and revoke access to controlling terminal. + */ + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { + tp = sp->s_ttyp; + if (sp->s_ttyp->t_pgrp) { + PGRP_LOCK(sp->s_ttyp->t_pgrp); + pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); + PGRP_UNLOCK(sp->s_ttyp->t_pgrp); + } + /* XXX tp should be locked. */ + sx_xunlock(&proctree_lock); + (void) ttywait(tp); + sx_xlock(&proctree_lock); + /* + * The tty could have been revoked + * if we blocked. + */ + if (sp->s_ttyvp) { + ttyvp = sp->s_ttyvp; + SESS_LOCK(p->p_session); + sp->s_ttyvp = NULL; + SESS_UNLOCK(p->p_session); + sx_xunlock(&proctree_lock); + VOP_REVOKE(ttyvp, REVOKEALL); + vrele(ttyvp); + sx_xlock(&proctree_lock); + } + } + if (sp->s_ttyvp) { + ttyvp = sp->s_ttyvp; + SESS_LOCK(p->p_session); + sp->s_ttyvp = NULL; + SESS_UNLOCK(p->p_session); + vrele(ttyvp); + } + /* + * s_ttyp is not zero'd; we use this to indicate + * that the session once had a controlling terminal. 
+ * (for logging and informational purposes) + */ + } + SESS_LOCK(p->p_session); + sp->s_leader = NULL; + SESS_UNLOCK(p->p_session); + } + fixjobc(p, p->p_pgrp, 0); + sx_xunlock(&proctree_lock); + (void)acct_process(td); +#ifdef KTRACE + /* + * release trace file + */ + PROC_LOCK(p); + mtx_lock(&ktrace_mtx); + p->p_traceflag = 0; /* don't trace the vrele() */ + tracevp = p->p_tracep; + p->p_tracep = NULL; + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + if (tracevp != NULL) + vrele(tracevp); +#endif + /* + * Release reference to text vnode + */ + if ((vtmp = p->p_textvp) != NULL) { + p->p_textvp = NULL; + vrele(vtmp); + } + + /* + * Release our limits structure. + */ + mtx_assert(&Giant, MA_OWNED); + if (--p->p_limit->p_refcnt == 0) { + FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } + + /* + * Release this thread's reference to the ucred. The actual proc + * reference will stay around until the proc is harvested by + * wait(). At this point the ucred is immutable (no other threads + * from this proc are around that can change it) so we leave the + * per-thread ucred pointer intact in case it is needed although + * in theory nothing should be using it at this point. + */ + crfree(td->td_ucred); + + /* + * Remove proc from allproc queue and pidhash chain. + * Place onto zombproc. Unlink from parent's child list. + */ + sx_xlock(&allproc_lock); + LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); + LIST_REMOVE(p, p_hash); + sx_xunlock(&allproc_lock); + + sx_xlock(&proctree_lock); + q = LIST_FIRST(&p->p_children); + if (q != NULL) /* only need this if any child is S_ZOMB */ + wakeup((caddr_t) initproc); + for (; q != NULL; q = nq) { + nq = LIST_NEXT(q, p_sibling); + PROC_LOCK(q); + proc_reparent(q, initproc); + q->p_sigparent = SIGCHLD; + /* + * Traced processes are killed + * since their existence means someone is screwing up. + */ + if (q->p_flag & P_TRACED) { + q->p_flag &= ~P_TRACED; + psignal(q, SIGKILL); + } + PROC_UNLOCK(q); + } + + /* + * Save exit status and final rusage info, adding in child rusage + * info and self times. + */ + PROC_LOCK(p); + p->p_xstat = rv; + *p->p_ru = p->p_stats->p_ru; + mtx_lock_spin(&sched_lock); + calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); + mtx_unlock_spin(&sched_lock); + ruadd(p->p_ru, &p->p_stats->p_cru); + + /* + * Notify interested parties of our demise. + */ + KNOTE(&p->p_klist, NOTE_EXIT); + + /* + * Notify parent that we're gone. If parent has the PS_NOCLDWAIT + * flag set, or if the handler is set to SIG_IGN, notify process + * 1 instead (and hope it will handle this situation). + */ + PROC_LOCK(p->p_pptr); + if (p->p_pptr->p_procsig->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { + struct proc *pp; + + pp = p->p_pptr; + PROC_UNLOCK(pp); + proc_reparent(p, initproc); + PROC_LOCK(p->p_pptr); + /* + * If this was the last child of our parent, notify + * parent, so in case he was wait(2)ing, he will + * continue. + */ + if (LIST_EMPTY(&pp->p_children)) + wakeup((caddr_t)pp); + } + + if (p->p_sigparent && p->p_pptr != initproc) + psignal(p->p_pptr, p->p_sigparent); + else + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + + /* + * If this is a kthread, then wakeup anyone waiting for it to exit. + */ + if (p->p_flag & P_KTHREAD) + wakeup((caddr_t)p); + PROC_UNLOCK(p); + + /* + * Finally, call machine-dependent code to release the remaining + * resources including address space, the kernel stack and pcb. + * The address space is released by "vmspace_exitfree(p)" in + * vm_waitproc(). 
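+ * cpu_throw() at the end of this function switches away for the
+ * last time and does not return.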
+ */ + cpu_exit(td); + + PROC_LOCK(p); + PROC_LOCK(p->p_pptr); + sx_xunlock(&proctree_lock); + mtx_lock_spin(&sched_lock); + while (mtx_owned(&Giant)) + mtx_unlock(&Giant); + + /* + * We have to wait until after releasing all locks before + * changing p_stat. If we block on a mutex then we will be + * back at SRUN when we resume and our parent will never + * harvest us. + */ + p->p_stat = SZOMB; + + wakeup(p->p_pptr); + PROC_UNLOCK(p->p_pptr); + PROC_UNLOCK(p); + + cnt.v_swtch++; + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + cpu_sched_exit(td); + cpu_throw(); + panic("exit1"); +} + +#ifdef COMPAT_43 +/* + * MPSAFE. The dirty work is handled by wait1(). + */ +int +owait(td, uap) + struct thread *td; + register struct owait_args /* { + int dummy; + } */ *uap; +{ + struct wait_args w; + + w.options = 0; + w.rusage = NULL; + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(td, &w, 1)); +} +#endif /* COMPAT_43 */ + +/* + * MPSAFE. The dirty work is handled by wait1(). + */ +int +wait4(td, uap) + struct thread *td; + struct wait_args *uap; +{ + + return (wait1(td, uap, 0)); +} + +/* + * MPSAFE + */ +static int +wait1(td, uap, compat) + register struct thread *td; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; + int compat; +{ + struct rusage ru; + register int nfound; + register struct proc *p, *q, *t; + int status, error; + + q = td->td_proc; + if (uap->pid == 0) { + PROC_LOCK(q); + uap->pid = -q->p_pgid; + PROC_UNLOCK(q); + } + if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) + return (EINVAL); + mtx_lock(&Giant); +loop: + nfound = 0; + sx_xlock(&proctree_lock); + LIST_FOREACH(p, &q->p_children, p_sibling) { + PROC_LOCK(p); + if (uap->pid != WAIT_ANY && + p->p_pid != uap->pid && p->p_pgid != -uap->pid) { + PROC_UNLOCK(p); + continue; + } + + /* + * This special case handles a kthread spawned by linux_clone + * (see linux_misc.c). The linux_wait4 and linux_waitpid + * functions need to be able to distinguish between waiting + * on a process and waiting on a thread. It is a thread if + * p_sigparent is not SIGCHLD, and the WLINUXCLONE option + * signifies we want to wait for threads and not processes. + */ + if ((p->p_sigparent != SIGCHLD) ^ + ((uap->options & WLINUXCLONE) != 0)) { + PROC_UNLOCK(p); + continue; + } + + nfound++; + if (p->p_stat == SZOMB) { + /* + * charge childs scheduling cpu usage to parent + * XXXKSE assume only one thread & kse & ksegrp + * keep estcpu in each ksegrp + * so charge it to the ksegrp that did the wait + * since process estcpu is sum of all ksegrps, + * this is strictly as expected. + * Assume that the child process aggregated all + * tke estcpu into the 'build-in' ksegrp. 
+ * XXXKSE + */ + if (curthread->td_proc->p_pid != 1) { + mtx_lock_spin(&sched_lock); + curthread->td_ksegrp->kg_estcpu = + ESTCPULIM(curthread->td_ksegrp->kg_estcpu + + p->p_ksegrp.kg_estcpu); + mtx_unlock_spin(&sched_lock); + } + + td->td_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) + td->td_retval[1] = p->p_xstat; + else +#endif + if (uap->status) { + status = p->p_xstat; /* convert to int */ + PROC_UNLOCK(p); + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (error); + } + PROC_LOCK(p); + } + if (uap->rusage) { + bcopy(p->p_ru, &ru, sizeof(ru)); + PROC_UNLOCK(p); + if ((error = copyout((caddr_t)&ru, + (caddr_t)uap->rusage, + sizeof (struct rusage)))) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (error); + } + } else + PROC_UNLOCK(p); + /* + * If we got the child via a ptrace 'attach', + * we need to give it back to the old parent. + */ + if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { + PROC_LOCK(p); + p->p_oppid = 0; + proc_reparent(p, t); + PROC_UNLOCK(p); + psignal(t, SIGCHLD); + wakeup((caddr_t)t); + PROC_UNLOCK(t); + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (0); + } + /* + * Remove other references to this process to ensure + * we have an exclusive reference. + */ + leavepgrp(p); + + sx_xlock(&allproc_lock); + LIST_REMOVE(p, p_list); /* off zombproc */ + sx_xunlock(&allproc_lock); + + LIST_REMOVE(p, p_sibling); + sx_xunlock(&proctree_lock); + + /* + * As a side effect of this lock, we know that + * all other writes to this proc are visible now, so + * no more locking is needed for p. + */ + PROC_LOCK(p); + p->p_xstat = 0; /* XXX: why? */ + PROC_UNLOCK(p); + PROC_LOCK(q); + ruadd(&q->p_stats->p_cru, p->p_ru); + PROC_UNLOCK(q); + FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; + + /* + * Decrement the count of procs running with this uid. + */ + (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); + + /* + * Free up credentials. + */ + crfree(p->p_ucred); + p->p_ucred = NULL; /* XXX: why? */ + + /* + * Remove unused arguments + */ + pargs_drop(p->p_args); + p->p_args = NULL; + + if (--p->p_procsig->ps_refcnt == 0) { + if (p->p_sigacts != &p->p_uarea->u_sigacts) + FREE(p->p_sigacts, M_SUBPROC); + FREE(p->p_procsig, M_SUBPROC); + p->p_procsig = NULL; + } + + /* + * Give vm and machine-dependent layer a chance + * to free anything that cpu_exit couldn't + * release while still running in process context. 
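+ * After that the proc structure itself is returned to proc_zone.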
+ */ + vm_waitproc(p); + mtx_destroy(&p->p_mtx); + uma_zfree(proc_zone, p); + sx_xlock(&allproc_lock); + nprocs--; + sx_xunlock(&allproc_lock); + mtx_unlock(&Giant); + return (0); + } + if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { + p->p_flag |= P_WAITED; + sx_xunlock(&proctree_lock); + td->td_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) { + td->td_retval[1] = W_STOPCODE(p->p_xstat); + PROC_UNLOCK(p); + error = 0; + } else +#endif + if (uap->status) { + status = W_STOPCODE(p->p_xstat); + PROC_UNLOCK(p); + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else { + PROC_UNLOCK(p); + error = 0; + } + mtx_unlock(&Giant); + return (error); + } + if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) { + sx_xunlock(&proctree_lock); + td->td_retval[0] = p->p_pid; + p->p_flag &= ~P_CONTINUED; + PROC_UNLOCK(p); + + if (uap->status) { + status = SIGCONT; + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else + error = 0; + + mtx_unlock(&Giant); + return (error); + } + PROC_UNLOCK(p); + } + if (nfound == 0) { + sx_xunlock(&proctree_lock); + mtx_unlock(&Giant); + return (ECHILD); + } + if (uap->options & WNOHANG) { + sx_xunlock(&proctree_lock); + td->td_retval[0] = 0; + mtx_unlock(&Giant); + return (0); + } + PROC_LOCK(q); + sx_xunlock(&proctree_lock); + error = msleep((caddr_t)q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); + PROC_UNLOCK(q); + if (error) { + mtx_unlock(&Giant); + return (error); + } + goto loop; +} + +/* + * Make process 'parent' the new parent of process 'child'. + * Must be called with an exclusive hold of proctree lock. + */ +void +proc_reparent(child, parent) + register struct proc *child; + register struct proc *parent; +{ + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(child, MA_OWNED); + if (child->p_pptr == parent) + return; + + LIST_REMOVE(child, p_sibling); + LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); + child->p_pptr = parent; +} + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. + */ + +int +at_exit(function) + exitlist_fn function; +{ + struct exitlist *ep; + +#ifdef INVARIANTS + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("WARNING: exit callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATEXIT, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + TAILQ_INSERT_TAIL(&exit_list, ep, next); + return (0); +} + +/* + * Scan the exit callout list for the given item and remove it. + * Returns the number of items removed (0 or 1) + */ +int +rm_at_exit(function) + exitlist_fn function; +{ + struct exitlist *ep; + + TAILQ_FOREACH(ep, &exit_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&exit_list, ep, next); + free(ep, M_ATEXIT); + return (1); + } + } + return (0); +} diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c new file mode 100644 index 0000000..016653b --- /dev/null +++ b/sys/kern/kern_fork.c @@ -0,0 +1,866 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/acct.h> +#include <sys/ktr.h> +#include <sys/ktrace.h> +#include <sys/kthread.h> +#include <sys/unistd.h> +#include <sys/jail.h> +#include <sys/sx.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +#include <sys/vmmeter.h> +#include <sys/user.h> +#include <machine/critical.h> + +static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +struct forklist { + forklist_fn function; + TAILQ_ENTRY(forklist) next; +}; + +static struct sx fork_list_lock; + +TAILQ_HEAD(forklist_head, forklist); +static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list); + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif + +int forksleep; /* Place for fork1() to sleep on. 
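Only its address matters; it is used as a tsleep() wait channel.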
*/ + +static void +init_fork_list(void *data __unused) +{ + + sx_init(&fork_list_lock, "fork list"); +} +SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL); + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fork(td, uap) + struct thread *td; + struct fork_args *uap; +{ + int error; + struct proc *p2; + + mtx_lock(&Giant); + error = fork1(td, RFFDG | RFPROC, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +vfork(td, uap) + struct thread *td; + struct vfork_args *uap; +{ + int error; + struct proc *p2; + + mtx_lock(&Giant); + error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + +/* + * MPSAFE + */ +int +rfork(td, uap) + struct thread *td; + struct rfork_args *uap; +{ + int error; + struct proc *p2; + + /* Don't allow kernel only flags. */ + if ((uap->flags & RFKERNELONLY) != 0) + return (EINVAL); + mtx_lock(&Giant); + error = fork1(td, uap->flags, &p2); + if (error == 0) { + td->td_retval[0] = p2 ? p2->p_pid : 0; + td->td_retval[1] = 0; + } + mtx_unlock(&Giant); + return error; +} + + +int nprocs = 1; /* process 0 */ +int lastpid = 0; +SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, + "Last used PID"); + +/* + * Random component to lastpid generation. We mix in a random factor to make + * it a little harder to predict. We sanity check the modulus value to avoid + * doing it in critical paths. Don't let it be too small or we pointlessly + * waste randomness entropy, and don't let it be impossibly large. Using a + * modulus that is too big causes a LOT more process table scans and slows + * down fork processing as the pidchecked caching is defeated. + */ +static int randompid = 0; + +static int +sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) +{ + int error, pid; + + sx_xlock(&allproc_lock); + pid = randompid; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error == 0 && req->newptr != NULL) { + if (pid < 0 || pid > PID_MAX - 100) /* out of range */ + pid = PID_MAX - 100; + else if (pid < 2) /* NOP */ + pid = 0; + else if (pid < 100) /* Make it reasonable */ + pid = 100; + randompid = pid; + } + sx_xunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); + +#if 0 +void +kse_init(struct kse *kse1, struct kse *kse2) +{ +} + +void +thread_init(struct thread *thread1, struct thread *thread2) +{ +} + +void +ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) +{ +} +#endif + +int +fork1(td, flags, procp) + struct thread *td; /* parent proc */ + int flags; + struct proc **procp; /* child proc */ +{ + struct proc *p2, *pptr; + uid_t uid; + struct proc *newproc; + int trypid; + int ok; + static int pidchecked = 0; + struct forklist *ep; + struct filedesc *fd; + struct proc *p1 = td->td_proc; + struct thread *td2; + struct kse *ke2; + struct ksegrp *kg2; + struct sigacts *newsigacts; + struct procsig *newprocsig; + + GIANT_REQUIRED; + + /* Can't copy and clear */ + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); + + /* + * Here we don't create a new process, but we divorce + * certain parts of a process from itself. + */ + if ((flags & RFPROC) == 0) { + vm_forkproc(td, NULL, NULL, flags); + + /* + * Close all file descriptors. 
+ */ + if (flags & RFCFDG) { + struct filedesc *fdtmp; + fdtmp = fdinit(td); /* XXXKSE */ + PROC_LOCK(p1); + fdfree(td); /* XXXKSE */ + p1->p_fd = fdtmp; + PROC_UNLOCK(p1); + } + + /* + * Unshare file descriptors (from parent.) + */ + if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + if (p1->p_fd->fd_refcnt > 1) { + struct filedesc *newfd; + + newfd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); + PROC_LOCK(p1); + fdfree(td); + p1->p_fd = newfd; + PROC_UNLOCK(p1); + } else + FILEDESC_UNLOCK(p1->p_fd); + } + *procp = NULL; + return (0); + } + + /* Allocate new proc. */ + newproc = uma_zalloc(proc_zone, M_WAITOK); + + /* + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last process; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + sx_xlock(&allproc_lock); + uid = td->td_ucred->cr_ruid; + if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { + sx_xunlock(&allproc_lock); + uma_zfree(proc_zone, newproc); + tsleep(&forksleep, PUSER, "fork", hz / 2); + return (EAGAIN); + } + /* + * Increment the count of procs running with this uid. Don't allow + * a nonprivileged user to exceed their current limit. + */ + PROC_LOCK(p1); + ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, + (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); + PROC_UNLOCK(p1); + if (!ok) { + sx_xunlock(&allproc_lock); + uma_zfree(proc_zone, newproc); + tsleep(&forksleep, PUSER, "fork", hz / 2); + return (EAGAIN); + } + + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; + + /* + * Find an unused process ID. We remember a range of unused IDs + * ready to use (from lastpid+1 through pidchecked-1). + * + * If RFHIGHPID is set (used during system boot), do not allocate + * low-numbered pids. + */ + trypid = lastpid + 1; + if (flags & RFHIGHPID) { + if (trypid < 10) { + trypid = 10; + } + } else { + if (randompid) + trypid += arc4random() % randompid; + } +retry: + /* + * If the process ID prototype has wrapped around, + * restart somewhat above 0, as the low-numbered procs + * tend to include daemons that don't exit. + */ + if (trypid >= PID_MAX) { + trypid = trypid % PID_MAX; + if (trypid < 100) + trypid += 100; + pidchecked = 0; + } + if (trypid >= pidchecked) { + int doingzomb = 0; + + pidchecked = PID_MAX; + /* + * Scan the active and zombie procs to check whether this pid + * is in use. Remember the lowest pid that's greater + * than trypid, so we can avoid checking for a while. + */ + p2 = LIST_FIRST(&allproc); +again: + for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { + PROC_LOCK(p2); + while (p2->p_pid == trypid || + p2->p_pgrp->pg_id == trypid || + p2->p_session->s_sid == trypid) { + trypid++; + if (trypid >= pidchecked) { + PROC_UNLOCK(p2); + goto retry; + } + } + if (p2->p_pid > trypid && pidchecked > p2->p_pid) + pidchecked = p2->p_pid; + if (p2->p_pgrp->pg_id > trypid && + pidchecked > p2->p_pgrp->pg_id) + pidchecked = p2->p_pgrp->pg_id; + if (p2->p_session->s_sid > trypid && + pidchecked > p2->p_session->s_sid) + pidchecked = p2->p_session->s_sid; + PROC_UNLOCK(p2); + } + if (!doingzomb) { + doingzomb = 1; + p2 = LIST_FIRST(&zombproc); + goto again; + } + } + + /* + * RFHIGHPID does not mess with the lastpid counter during boot. 
+ */ + if (flags & RFHIGHPID) + pidchecked = 0; + else + lastpid = trypid; + + p2 = newproc; + p2->p_stat = SIDL; /* protect against others */ + p2->p_pid = trypid; + LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + sx_xunlock(&allproc_lock); + + /* + * Malloc things while we don't hold any locks. + */ + if (flags & RFSIGSHARE) { + MALLOC(newsigacts, struct sigacts *, + sizeof(struct sigacts), M_SUBPROC, M_WAITOK); + newprocsig = NULL; + } else { + newsigacts = NULL; + MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + } + + /* + * Copy filedesc. + * XXX: This is busted. fd*() need to not take proc + * arguments or something. + */ + if (flags & RFCFDG) + fd = fdinit(td); + else if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + fd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); + } else + fd = fdshare(p1); + + /* + * Make a proc table entry for the new process. + * Start by zeroing the section of proc that is zero-initialized, + * then copy the section that is copied directly from the parent. + */ + td2 = thread_get(p2); + ke2 = &p2->p_kse; + kg2 = &p2->p_ksegrp; + +#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) + + bzero(&p2->p_startzero, + (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); + bzero(&ke2->ke_startzero, + (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); + bzero(&td2->td_startzero, + (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); + bzero(&kg2->kg_startzero, + (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); + + mtx_init(&p2->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + PROC_LOCK(p2); + PROC_LOCK(p1); + + bcopy(&p1->p_startcopy, &p2->p_startcopy, + (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); + bcopy(&td->td_kse->ke_startcopy, &ke2->ke_startcopy, + (unsigned) RANGEOF(struct kse, ke_startcopy, ke_endcopy)); + bcopy(&td->td_startcopy, &td2->td_startcopy, + (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); + bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, + (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); +#undef RANGEOF + + /* + * XXXKSE Theoretically only the running thread would get copied + * Others in the kernel would be 'aborted' in the child. + * i.e return E*something* + */ + proc_linkup(p2, kg2, ke2, td2); + + /* note.. XXXKSE no pcb or u-area yet */ + + /* + * Duplicate sub-structures as needed. + * Increase reference counts on shared objects. + * The p_stats and p_sigacts substructs are set in vm_forkproc. + */ + p2->p_flag = 0; + mtx_lock_spin(&sched_lock); + p2->p_sflag = PS_INMEM; + if (p1->p_sflag & PS_PROFIL) + startprofclock(p2); + mtx_unlock_spin(&sched_lock); + p2->p_ucred = crhold(td->td_ucred); + td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ + + /* + * Setup linkage for kernel based threading + */ + if((flags & RFTHREAD) != 0) { + /* + * XXX: This assumes a leader is a parent or grandparent of + * all processes in a task. + */ + if (p1->p_leader != p1) + PROC_LOCK(p1->p_leader); + p2->p_peers = p1->p_peers; + p1->p_peers = p2; + p2->p_leader = p1->p_leader; + if (p1->p_leader != p1) + PROC_UNLOCK(p1->p_leader); + } else { + p2->p_peers = NULL; + p2->p_leader = p2; + } + + pargs_hold(p2->p_args); + + if (flags & RFSIGSHARE) { + p2->p_procsig = p1->p_procsig; + p2->p_procsig->ps_refcnt++; + if (p1->p_sigacts == &p1->p_uarea->u_sigacts) { + /* + * Set p_sigacts to the new shared structure. 
+ * Note that this is updating p1->p_sigacts at the + * same time, since p_sigacts is just a pointer to + * the shared p_procsig->ps_sigacts. + */ + p2->p_sigacts = newsigacts; + newsigacts = NULL; + *p2->p_sigacts = p1->p_uarea->u_sigacts; + } + } else { + p2->p_procsig = newprocsig; + newprocsig = NULL; + bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig)); + p2->p_procsig->ps_refcnt = 1; + p2->p_sigacts = NULL; /* finished in vm_forkproc() */ + } + if (flags & RFLINUXTHPN) + p2->p_sigparent = SIGUSR1; + else + p2->p_sigparent = SIGCHLD; + + /* Bump references to the text vnode (for procfs) */ + p2->p_textvp = p1->p_textvp; + if (p2->p_textvp) + VREF(p2->p_textvp); + p2->p_fd = fd; + PROC_UNLOCK(p1); + PROC_UNLOCK(p2); + + /* + * If p_limit is still copy-on-write, bump refcnt, + * otherwise get a copy that won't be modified. + * (If PL_SHAREMOD is clear, the structure is shared + * copy-on-write.) + */ + if (p1->p_limit->p_lflags & PL_SHAREMOD) + p2->p_limit = limcopy(p1->p_limit); + else { + p2->p_limit = p1->p_limit; + p2->p_limit->p_refcnt++; + } + + sx_xlock(&proctree_lock); + PGRP_LOCK(p1->p_pgrp); + PROC_LOCK(p2); + PROC_LOCK(p1); + + /* + * Preserve some more flags in subprocess. PS_PROFIL has already + * been preserved. + */ + p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); + SESS_LOCK(p1->p_session); + if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) + p2->p_flag |= P_CONTROLT; + SESS_UNLOCK(p1->p_session); + if (flags & RFPPWAIT) + p2->p_flag |= P_PPWAIT; + + LIST_INSERT_AFTER(p1, p2, p_pglist); + PGRP_UNLOCK(p1->p_pgrp); + LIST_INIT(&p2->p_children); + LIST_INIT(&td2->td_contested); /* XXXKSE only 1 thread? */ + + callout_init(&p2->p_itcallout, 0); + callout_init(&td2->td_slpcallout, 1); /* XXXKSE */ + +#ifdef KTRACE + /* + * Copy traceflag and tracefile if enabled. + */ + mtx_lock(&ktrace_mtx); + KASSERT(p2->p_tracep == NULL, ("new process has a ktrace vnode")); + if (p1->p_traceflag & KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracep = p1->p_tracep) != NULL) + VREF(p2->p_tracep); + } + mtx_unlock(&ktrace_mtx); +#endif + + /* + * set priority of child to be that of parent + * XXXKSE hey! copying the estcpu seems dodgy.. should split it.. + */ + mtx_lock_spin(&sched_lock); + p2->p_ksegrp.kg_estcpu = p1->p_ksegrp.kg_estcpu; + mtx_unlock_spin(&sched_lock); + + /* + * This begins the section where we must prevent the parent + * from being swapped. + */ + _PHOLD(p1); + PROC_UNLOCK(p1); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); + PROC_UNLOCK(p2); + sx_xunlock(&proctree_lock); + + /* + * XXXKSE: In KSE, there would be a race here if one thread was + * dieing due to a signal (or calling exit1() for that matter) while + * another thread was calling fork1(). Not sure how KSE wants to work + * around that. The problem is that up until the point above, if p1 + * gets killed, it won't find p2 in its list in order for it to be + * reparented. Alternatively, we could add a new p_flag that gets set + * before we reparent all the children that we check above and just + * use init as our parent if that if that flag is set. (Either that + * or abort the fork if the flag is set since our parent died trying + * to fork us (which is evil)). 
+ */ + + KASSERT(newprocsig == NULL, ("unused newprocsig")); + if (newsigacts != NULL) + FREE(newsigacts, M_SUBPROC); + /* + * Finish creating the child process. It will return via a different + * execution path later. (ie: directly into user mode) + */ + vm_forkproc(td, p2, td2, flags); + + if (flags == (RFFDG | RFPROC)) { + cnt.v_forks++; + cnt.v_forkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { + cnt.v_vforks++; + cnt.v_vforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (p1 == &proc0) { + cnt.v_kthreads++; + cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else { + cnt.v_rforks++; + cnt.v_rforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } + + /* + * Both processes are set up, now check if any loadable modules want + * to adjust anything. + * What if they have an error? XXX + */ + sx_slock(&fork_list_lock); + TAILQ_FOREACH(ep, &fork_list, next) { + (*ep->function)(p1, p2, flags); + } + sx_sunlock(&fork_list_lock); + + /* + * If RFSTOPPED not requested, make child runnable and add to + * run queue. + */ + microtime(&(p2->p_stats->p_start)); + p2->p_acflag = AFORK; + if ((flags & RFSTOPPED) == 0) { + mtx_lock_spin(&sched_lock); + p2->p_stat = SRUN; + setrunqueue(td2); + mtx_unlock_spin(&sched_lock); + } + + /* + * Now can be swapped. + */ + PROC_LOCK(p1); + _PRELE(p1); + + /* + * tell any interested parties about the new process + */ + KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); + PROC_UNLOCK(p1); + + /* + * Preserve synchronization semantics of vfork. If waiting for + * child to exec or exit, set P_PPWAIT on child, and sleep on our + * proc (in case of exit). + */ + PROC_LOCK(p2); + while (p2->p_flag & P_PPWAIT) + msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); + PROC_UNLOCK(p2); + + /* + * Return child proc pointer to parent. + */ + *procp = p2; + return (0); +} + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. + * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ + +int +at_fork(function) + forklist_fn function; +{ + struct forklist *ep; + +#ifdef INVARIANTS + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("WARNING: fork callout entry (%p) already present\n", + function); +#endif + ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + sx_xlock(&fork_list_lock); + TAILQ_INSERT_TAIL(&fork_list, ep, next); + sx_xunlock(&fork_list_lock); + return (0); +} + +/* + * Scan the exit callout list for the given item and remove it.. + * Returns the number of items removed (0 or 1) + */ + +int +rm_at_fork(function) + forklist_fn function; +{ + struct forklist *ep; + + sx_xlock(&fork_list_lock); + TAILQ_FOREACH(ep, &fork_list, next) { + if (ep->function == function) { + TAILQ_REMOVE(&fork_list, ep, next); + sx_xunlock(&fork_list_lock); + free(ep, M_ATFORK); + return(1); + } + } + sx_xunlock(&fork_list_lock); + return (0); +} + +/* + * Handle the return of a child process from fork1(). This function + * is called from the MD fork_trampoline() entry point. 
+ */ +void +fork_exit(callout, arg, frame) + void (*callout)(void *, struct trapframe *); + void *arg; + struct trapframe *frame; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + + td->td_kse->ke_oncpu = PCPU_GET(cpuid); + /* + * Finish setting up thread glue. We need to initialize + * the thread into a td_critnest=1 state. Some platforms + * may have already partially or fully initialized td_critnest + * and/or td_md.md_savecrit (when applciable). + * + * see <arch>/<arch>/critical.c + */ + sched_lock.mtx_lock = (uintptr_t)td; + sched_lock.mtx_recurse = 0; + cpu_critical_fork_exit(); + CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + if (PCPU_GET(switchtime.sec) == 0) + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + mtx_unlock_spin(&sched_lock); + + /* + * cpu_set_fork_handler intercepts this function call to + * have this call a non-return function to stay in kernel mode. + * initproc has its own fork handler, but it does return. + */ + KASSERT(callout != NULL, ("NULL callout in fork_exit")); + callout(arg, frame); + + /* + * Check if a kernel thread misbehaved and returned from its main + * function. + */ + PROC_LOCK(p); + if (p->p_flag & P_KTHREAD) { + PROC_UNLOCK(p); + mtx_lock(&Giant); + printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", + p->p_comm, p->p_pid); + kthread_exit(0); + } + PROC_UNLOCK(p); + mtx_assert(&Giant, MA_NOTOWNED); +} + +/* + * Simplified back end of syscall(), used when returning from fork() + * directly into user mode. Giant is not held on entry, and must not + * be held on return. This function is passed in to fork_exit() as the + * first parameter and is called when returning to a new userland process. + */ +void +fork_return(td, frame) + struct thread *td; + struct trapframe *frame; +{ + + userret(td, frame, 0); +#ifdef KTRACE + if (KTRPOINT(td, KTR_SYSRET)) + ktrsysret(SYS_fork, 0, 0); +#endif + mtx_assert(&Giant, MA_NOTOWNED); +} diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c new file mode 100644 index 0000000..29194b7 --- /dev/null +++ b/sys/kern/kern_idle.c @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 2000, All rights reserved. See /usr/src/COPYRIGHT + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> +#include <sys/unistd.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +static void idle_setup(void *dummy); +SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) + +static void idle_proc(void *dummy); + +/* + * Setup per-cpu idle process contexts. The AP's shouldn't be running or + * accessing their idle processes at this point, so don't bother with + * locking. 
+ */ +static void +idle_setup(void *dummy) +{ +#ifdef SMP + struct pcpu *pc; +#endif + struct proc *p; + int error; + +#ifdef SMP + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + error = kthread_create(idle_proc, NULL, &p, + RFSTOPPED | RFHIGHPID, "idle: cpu%d", pc->pc_cpuid); + pc->pc_idlethread = FIRST_THREAD_IN_PROC(p); + if (pc->pc_curthread == NULL) { + pc->pc_curthread = pc->pc_idlethread; + pc->pc_idlethread->td_critnest = 0; + } +#else + error = kthread_create(idle_proc, NULL, &p, + RFSTOPPED | RFHIGHPID, "idle"); + PCPU_SET(idlethread, FIRST_THREAD_IN_PROC(p)); +#endif + if (error) + panic("idle_setup: kthread_create error %d\n", error); + + p->p_flag |= P_NOLOAD; + p->p_stat = SRUN; +#ifdef SMP + } +#endif +} + +/* + * idle process context + */ +static void +idle_proc(void *dummy) +{ +#ifdef DIAGNOSTIC + int count; +#endif + + for (;;) { + mtx_assert(&Giant, MA_NOTOWNED); + +#ifdef DIAGNOSTIC + count = 0; + + while (count >= 0 && procrunnable() == 0) { +#else + while (procrunnable() == 0) { +#endif + /* + * This is a good place to put things to be done in + * the background, including sanity checks. + */ + +#ifdef DIAGNOSTIC + if (count++ < 0) + CTR0(KTR_PROC, "idle_proc: timed out waiting" + " for a process"); +#endif + +#ifdef __i386__ + cpu_idle(); +#endif + } + + mtx_lock_spin(&sched_lock); + curproc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } +} diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c new file mode 100644 index 0000000..d65dc82 --- /dev/null +++ b/sys/kern/kern_intr.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 1997, Stefan Esser <se@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/rtprio.h> +#include <sys/systm.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/random.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/unistd.h> +#include <sys/vmmeter.h> +#include <machine/atomic.h> +#include <machine/cpu.h> +#include <machine/md_var.h> +#include <machine/stdarg.h> + +#include <net/netisr.h> /* prototype for legacy_setsoftnet */ + +struct int_entropy { + struct proc *proc; + int vector; +}; + +void *net_ih; +void *vm_ih; +void *softclock_ih; +struct ithd *clk_ithd; +struct ithd *tty_ithd; + +static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); + +static void ithread_update(struct ithd *); +static void ithread_loop(void *); +static void start_softintr(void *); +static void swi_net(void *); + +u_char +ithread_priority(enum intr_type flags) +{ + u_char pri; + + flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | + INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); + switch (flags) { + case INTR_TYPE_TTY: + pri = PI_TTYLOW; + break; + case INTR_TYPE_BIO: + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; + break; + case INTR_TYPE_NET: + pri = PI_NET; + break; + case INTR_TYPE_CAM: + pri = PI_DISK; /* XXX or PI_CAM? */ + break; + case INTR_TYPE_AV: /* Audio/video */ + pri = PI_AV; + break; + case INTR_TYPE_CLK: + pri = PI_REALTIME; + break; + case INTR_TYPE_MISC: + pri = PI_DULL; /* don't care */ + break; + default: + /* We didn't specify an interrupt level. */ + panic("ithread_priority: no interrupt type in flags"); + } + + return pri; +} + +/* + * Regenerate the name (p_comm) and priority for a threaded interrupt thread. + */ +static void +ithread_update(struct ithd *ithd) +{ + struct intrhand *ih; + struct thread *td; + struct proc *p; + int entropy; + + mtx_assert(&ithd->it_lock, MA_OWNED); + td = ithd->it_td; + if (td == NULL) + return; + p = td->td_proc; + + strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name)); + ih = TAILQ_FIRST(&ithd->it_handlers); + if (ih == NULL) { + mtx_lock_spin(&sched_lock); + td->td_priority = PRI_MAX_ITHD; + td->td_base_pri = PRI_MAX_ITHD; + mtx_unlock_spin(&sched_lock); + ithd->it_flags &= ~IT_ENTROPY; + return; + } + entropy = 0; + mtx_lock_spin(&sched_lock); + td->td_priority = ih->ih_pri; + td->td_base_pri = ih->ih_pri; + mtx_unlock_spin(&sched_lock); + TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { + if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 < + sizeof(p->p_comm)) { + strcat(p->p_comm, " "); + strcat(p->p_comm, ih->ih_name); + } else if (strlen(p->p_comm) + 1 == sizeof(p->p_comm)) { + if (p->p_comm[sizeof(p->p_comm) - 2] == '+') + p->p_comm[sizeof(p->p_comm) - 2] = '*'; + else + p->p_comm[sizeof(p->p_comm) - 2] = '+'; + } else + strcat(p->p_comm, "+"); + if (ih->ih_flags & IH_ENTROPY) + entropy++; + } + if (entropy) + ithd->it_flags |= IT_ENTROPY; + else + ithd->it_flags &= ~IT_ENTROPY; + CTR2(KTR_INTR, "%s: updated %s\n", __func__, p->p_comm); +} + +int +ithread_create(struct ithd **ithread, int vector, int flags, + void (*disable)(int), void (*enable)(int), const char *fmt, ...) +{ + struct ithd *ithd; + struct thread *td; + struct proc *p; + int error; + va_list ap; + + /* The only valid flag during creation is IT_SOFT. 
*/ + if ((flags & ~IT_SOFT) != 0) + return (EINVAL); + + ithd = malloc(sizeof(struct ithd), M_ITHREAD, M_WAITOK | M_ZERO); + ithd->it_vector = vector; + ithd->it_disable = disable; + ithd->it_enable = enable; + ithd->it_flags = flags; + TAILQ_INIT(&ithd->it_handlers); + mtx_init(&ithd->it_lock, "ithread", NULL, MTX_DEF); + + va_start(ap, fmt); + vsnprintf(ithd->it_name, sizeof(ithd->it_name), fmt, ap); + va_end(ap); + + error = kthread_create(ithread_loop, ithd, &p, RFSTOPPED | RFHIGHPID, + "%s", ithd->it_name); + if (error) { + mtx_destroy(&ithd->it_lock); + free(ithd, M_ITHREAD); + return (error); + } + td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ + td->td_ksegrp->kg_pri_class = PRI_ITHD; + td->td_priority = PRI_MAX_ITHD; + p->p_stat = SWAIT; + ithd->it_td = td; + td->td_ithd = ithd; + if (ithread != NULL) + *ithread = ithd; + + CTR2(KTR_INTR, "%s: created %s", __func__, ithd->it_name); + return (0); +} + +int +ithread_destroy(struct ithd *ithread) +{ + + struct thread *td; + struct proc *p; + if (ithread == NULL) + return (EINVAL); + + td = ithread->it_td; + p = td->td_proc; + mtx_lock(&ithread->it_lock); + if (!TAILQ_EMPTY(&ithread->it_handlers)) { + mtx_unlock(&ithread->it_lock); + return (EINVAL); + } + ithread->it_flags |= IT_DEAD; + mtx_lock_spin(&sched_lock); + if (p->p_stat == SWAIT) { + p->p_stat = SRUN; /* XXXKSE */ + setrunqueue(td); + } + mtx_unlock_spin(&sched_lock); + mtx_unlock(&ithread->it_lock); + CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_name); + return (0); +} + +int +ithread_add_handler(struct ithd* ithread, const char *name, + driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, + void **cookiep) +{ + struct intrhand *ih, *temp_ih; + + if (ithread == NULL || name == NULL || handler == NULL) + return (EINVAL); + if ((flags & INTR_FAST) !=0) + flags |= INTR_EXCL; + + ih = malloc(sizeof(struct intrhand), M_ITHREAD, M_WAITOK | M_ZERO); + ih->ih_handler = handler; + ih->ih_argument = arg; + ih->ih_name = name; + ih->ih_ithread = ithread; + ih->ih_pri = pri; + if (flags & INTR_FAST) + ih->ih_flags = IH_FAST | IH_EXCLUSIVE; + else if (flags & INTR_EXCL) + ih->ih_flags = IH_EXCLUSIVE; + if (flags & INTR_MPSAFE) + ih->ih_flags |= IH_MPSAFE; + if (flags & INTR_ENTROPY) + ih->ih_flags |= IH_ENTROPY; + + mtx_lock(&ithread->it_lock); + if ((flags & INTR_EXCL) !=0 && !TAILQ_EMPTY(&ithread->it_handlers)) + goto fail; + if (!TAILQ_EMPTY(&ithread->it_handlers) && + (TAILQ_FIRST(&ithread->it_handlers)->ih_flags & IH_EXCLUSIVE) != 0) + goto fail; + + TAILQ_FOREACH(temp_ih, &ithread->it_handlers, ih_next) + if (temp_ih->ih_pri > ih->ih_pri) + break; + if (temp_ih == NULL) + TAILQ_INSERT_TAIL(&ithread->it_handlers, ih, ih_next); + else + TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); + ithread_update(ithread); + mtx_unlock(&ithread->it_lock); + + if (cookiep != NULL) + *cookiep = ih; + CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, + ithread->it_name); + return (0); + +fail: + mtx_unlock(&ithread->it_lock); + free(ih, M_ITHREAD); + return (EINVAL); +} + +int +ithread_remove_handler(void *cookie) +{ + struct intrhand *handler = (struct intrhand *)cookie; + struct ithd *ithread; +#ifdef INVARIANTS + struct intrhand *ih; +#endif + + if (handler == NULL) + return (EINVAL); + ithread = handler->ih_ithread; + KASSERT(ithread != NULL, + ("interrupt handler \"%s\" has a NULL interrupt thread", + handler->ih_name)); + CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, + ithread->it_name); + mtx_lock(&ithread->it_lock); +#ifdef INVARIANTS + 
TAILQ_FOREACH(ih, &ithread->it_handlers, ih_next) + if (ih == handler) + goto ok; + mtx_unlock(&ithread->it_lock); + panic("interrupt handler \"%s\" not found in interrupt thread \"%s\"", + ih->ih_name, ithread->it_name); +ok: +#endif + /* + * If the interrupt thread is already running, then just mark this + * handler as being dead and let the ithread do the actual removal. + */ + mtx_lock_spin(&sched_lock); + if (ithread->it_td->td_proc->p_stat != SWAIT) { + handler->ih_flags |= IH_DEAD; + + /* + * Ensure that the thread will process the handler list + * again and remove this handler if it has already passed + * it on the list. + */ + ithread->it_need = 1; + } else + TAILQ_REMOVE(&ithread->it_handlers, handler, ih_next); + mtx_unlock_spin(&sched_lock); + if ((handler->ih_flags & IH_DEAD) != 0) + msleep(handler, &ithread->it_lock, PUSER, "itrmh", 0); + ithread_update(ithread); + mtx_unlock(&ithread->it_lock); + free(handler, M_ITHREAD); + return (0); +} + +int +ithread_schedule(struct ithd *ithread, int do_switch) +{ + struct int_entropy entropy; + struct thread *td; + struct proc *p; + + /* + * If no ithread or no handlers, then we have a stray interrupt. + */ + if ((ithread == NULL) || TAILQ_EMPTY(&ithread->it_handlers)) + return (EINVAL); + + /* + * If any of the handlers for this ithread claim to be good + * sources of entropy, then gather some. + */ + if (harvest.interrupt && ithread->it_flags & IT_ENTROPY) { + entropy.vector = ithread->it_vector; + entropy.proc = curthread->td_proc;; + random_harvest(&entropy, sizeof(entropy), 2, 0, + RANDOM_INTERRUPT); + } + + td = ithread->it_td; + p = td->td_proc; + KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name)); + CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm, + ithread->it_need); + + /* + * Set it_need to tell the thread to keep running if it is already + * running. Then, grab sched_lock and see if we actually need to + * put this thread on the runqueue. If so and the do_switch flag is + * true and it is safe to switch, then switch to the ithread + * immediately. Otherwise, set the needresched flag to guarantee + * that this ithread will run before any userland processes. + */ + ithread->it_need = 1; + mtx_lock_spin(&sched_lock); + if (p->p_stat == SWAIT) { + CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid); + p->p_stat = SRUN; + setrunqueue(td); /* XXXKSE */ + if (do_switch && curthread->td_critnest == 1 && + curthread->td_proc->p_stat == SRUN) { + if (curthread != PCPU_GET(idlethread)) + setrunqueue(curthread); + curthread->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } else { + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; + } + } else { + CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d", + __func__, p->p_pid, ithread->it_need, p->p_stat); + } + mtx_unlock_spin(&sched_lock); + + return (0); +} + +int +swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler, + void *arg, int pri, enum intr_type flags, void **cookiep) +{ + struct ithd *ithd; + int error; + + if (flags & (INTR_FAST | INTR_ENTROPY)) + return (EINVAL); + + ithd = (ithdp != NULL) ? *ithdp : NULL; + + if (ithd != NULL) { + if ((ithd->it_flags & IT_SOFT) == 0) + return(EINVAL); + } else { + error = ithread_create(&ithd, pri, IT_SOFT, NULL, NULL, + "swi%d:", pri); + if (error) + return (error); + + if (ithdp != NULL) + *ithdp = ithd; + } + return (ithread_add_handler(ithd, name, handler, arg, + (pri * RQ_PPQ) + PI_SOFT, flags, cookiep)); +} + + +/* + * Schedule a heavyweight software interrupt process. 
+ */ +void +swi_sched(void *cookie, int flags) +{ + struct intrhand *ih = (struct intrhand *)cookie; + struct ithd *it = ih->ih_ithread; + int error; + + atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ + + CTR3(KTR_INTR, "swi_sched pid %d(%s) need=%d", + it->it_td->td_proc->p_pid, it->it_td->td_proc->p_comm, it->it_need); + + /* + * Set ih_need for this handler so that if the ithread is already + * running it will execute this handler on the next pass. Otherwise, + * it will execute it the next time it runs. + */ + atomic_store_rel_int(&ih->ih_need, 1); + if (!(flags & SWI_DELAY)) { + error = ithread_schedule(it, !cold); + KASSERT(error == 0, ("stray software interrupt")); + } +} + +/* + * This is the main code for interrupt threads. + */ +void +ithread_loop(void *arg) +{ + struct ithd *ithd; /* our thread context */ + struct intrhand *ih; /* and our interrupt handler chain */ + struct thread *td; + struct proc *p; + + td = curthread; + p = td->td_proc; + ithd = (struct ithd *)arg; /* point to myself */ + KASSERT(ithd->it_td == td && td->td_ithd == ithd, + ("%s: ithread and proc linkage out of sync", __func__)); + + /* + * As long as we have interrupts outstanding, go through the + * list of handlers, giving each one a go at it. + */ + for (;;) { + /* + * If we are an orphaned thread, then just die. + */ + if (ithd->it_flags & IT_DEAD) { + CTR3(KTR_INTR, "%s: pid %d: (%s) exiting", __func__, + p->p_pid, p->p_comm); + td->td_ithd = NULL; + mtx_destroy(&ithd->it_lock); + mtx_lock(&Giant); + free(ithd, M_ITHREAD); + kthread_exit(0); + } + + CTR4(KTR_INTR, "%s: pid %d: (%s) need=%d", __func__, + p->p_pid, p->p_comm, ithd->it_need); + while (ithd->it_need) { + /* + * Service interrupts. If another interrupt + * arrives while we are running, they will set + * it_need to denote that we should make + * another pass. + */ + atomic_store_rel_int(&ithd->it_need, 0); +restart: + TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { + if (ithd->it_flags & IT_SOFT && !ih->ih_need) + continue; + atomic_store_rel_int(&ih->ih_need, 0); + CTR6(KTR_INTR, + "%s: pid %d ih=%p: %p(%p) flg=%x", __func__, + p->p_pid, (void *)ih, + (void *)ih->ih_handler, ih->ih_argument, + ih->ih_flags); + + if ((ih->ih_flags & IH_DEAD) != 0) { + mtx_lock(&ithd->it_lock); + TAILQ_REMOVE(&ithd->it_handlers, ih, + ih_next); + wakeup(ih); + mtx_unlock(&ithd->it_lock); + goto restart; + } + if ((ih->ih_flags & IH_MPSAFE) == 0) + mtx_lock(&Giant); + ih->ih_handler(ih->ih_argument); + if ((ih->ih_flags & IH_MPSAFE) == 0) + mtx_unlock(&Giant); + } + } + + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + if (!ithd->it_need) { + /* + * Should we call this earlier in the loop above? 
+ */ + if (ithd->it_enable != NULL) + ithd->it_enable(ithd->it_vector); + p->p_stat = SWAIT; /* we're idle */ + p->p_stats->p_ru.ru_nvcsw++; + CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid); + mi_switch(); + CTR2(KTR_INTR, "%s: pid %d: resumed", __func__, p->p_pid); + } + mtx_unlock_spin(&sched_lock); + } +} + +/* + * Start standard software interrupt threads + */ +static void +start_softintr(void *dummy) +{ + + if (swi_add(NULL, "net", swi_net, NULL, SWI_NET, 0, &net_ih) || + swi_add(&clk_ithd, "clock", softclock, NULL, SWI_CLOCK, + INTR_MPSAFE, &softclock_ih) || + swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, 0, &vm_ih)) + panic("died while creating standard software ithreads"); + + PROC_LOCK(clk_ithd->it_td->td_proc); + clk_ithd->it_td->td_proc->p_flag |= P_NOLOAD; + PROC_UNLOCK(clk_ithd->it_td->td_proc); +} +SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) + +void +legacy_setsoftnet(void) +{ + swi_sched(net_ih, 0); +} + +/* + * XXX: This should really be in the network code somewhere and installed + * via a SI_SUB_SOFINTR, SI_ORDER_MIDDLE sysinit. + */ +void (*netisrs[32])(void); +volatile unsigned int netisr; /* scheduling bits for network */ + +int +register_netisr(num, handler) + int num; + netisr_t *handler; +{ + + if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { + printf("register_netisr: bad isr number: %d\n", num); + return (EINVAL); + } + netisrs[num] = handler; + return (0); +} + +int +unregister_netisr(num) + int num; +{ + + if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { + printf("unregister_netisr: bad isr number: %d\n", num); + return (EINVAL); + } + netisrs[num] = NULL; + return (0); +} + +#ifdef DEVICE_POLLING + void netisr_pollmore(void); +#endif + +static void +swi_net(void *dummy) +{ + u_int bits; + int i; + +#ifdef DEVICE_POLLING + for (;;) { + int pollmore; +#endif + bits = atomic_readandclear_int(&netisr); +#ifdef DEVICE_POLLING + if (bits == 0) + return; + pollmore = bits & (1 << NETISR_POLL); +#endif + while ((i = ffs(bits)) != 0) { + i--; + if (netisrs[i] != NULL) + netisrs[i](); + else + printf("swi_net: unregistered isr number: %d.\n", i); + bits &= ~(1 << i); + } +#ifdef DEVICE_POLLING + if (pollmore) + netisr_pollmore(); + } +#endif +} + +/* + * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. + * The data for this machine dependent, and the declarations are in machine + * dependent code. The layout of intrnames and intrcnt however is machine + * independent. + * + * We do not know the length of intrcnt and intrnames at compile time, so + * calculate things at run time. + */ +static int +sysctl_intrnames(SYSCTL_HANDLER_ARGS) +{ + return (sysctl_handle_opaque(oidp, intrnames, eintrnames - intrnames, + req)); +} + +SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, + NULL, 0, sysctl_intrnames, "", "Interrupt Names"); + +static int +sysctl_intrcnt(SYSCTL_HANDLER_ARGS) +{ + return (sysctl_handle_opaque(oidp, intrcnt, + (char *)eintrcnt - (char *)intrcnt, req)); +} + +SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, + NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c new file mode 100644 index 0000000..cf3b03c --- /dev/null +++ b/sys/kern/kern_jail.c @@ -0,0 +1,256 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. 
As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/jail.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <netinet/in.h> + +MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); + +SYSCTL_DECL(_security); +SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, + "Jail rules"); + +mp_fixme("these variables need a lock") + +int jail_set_hostname_allowed = 1; +SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW, + &jail_set_hostname_allowed, 0, + "Processes in jail can set their hostnames"); + +int jail_socket_unixiproute_only = 1; +SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW, + &jail_socket_unixiproute_only, 0, + "Processes in jail are limited to creating UNIX/IPv4/route sockets only"); + +int jail_sysvipc_allowed = 0; +SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW, + &jail_sysvipc_allowed, 0, + "Processes in jail can use System V IPC primitives"); + +/* + * MPSAFE + */ +int +jail(td, uap) + struct thread *td; + struct jail_args /* { + syscallarg(struct jail *) jail; + } */ *uap; +{ + struct proc *p = td->td_proc; + int error; + struct prison *pr; + struct jail j; + struct chroot_args ca; + struct ucred *newcred = NULL, *oldcred; + + error = copyin(uap->jail, &j, sizeof j); + if (error) + return (error); + if (j.version != 0) + return (EINVAL); + + MALLOC(pr, struct prison *, sizeof *pr , M_PRISON, M_WAITOK | M_ZERO); + mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); + pr->pr_securelevel = securelevel; + error = copyinstr(j.hostname, &pr->pr_host, sizeof pr->pr_host, 0); + if (error) + goto bail; + ca.path = j.path; + error = chroot(td, &ca); + if (error) + goto bail; + newcred = crget(); + pr->pr_ip = j.ip_number; + PROC_LOCK(p); + /* Implicitly fail if already in jail. 
*/ + error = suser_cred(p->p_ucred, 0); + if (error) + goto badcred; + oldcred = p->p_ucred; + crcopy(newcred, oldcred); + p->p_ucred = newcred; + p->p_ucred->cr_prison = pr; + pr->pr_ref = 1; + PROC_UNLOCK(p); + crfree(oldcred); + return (0); +badcred: + PROC_UNLOCK(p); + crfree(newcred); +bail: + FREE(pr, M_PRISON); + return (error); +} + +void +prison_free(struct prison *pr) +{ + + mtx_lock(&pr->pr_mtx); + pr->pr_ref--; + if (pr->pr_ref == 0) { + mtx_unlock(&pr->pr_mtx); + mtx_destroy(&pr->pr_mtx); + if (pr->pr_linux != NULL) + FREE(pr->pr_linux, M_PRISON); + FREE(pr, M_PRISON); + return; + } + mtx_unlock(&pr->pr_mtx); +} + +void +prison_hold(struct prison *pr) +{ + + mtx_lock(&pr->pr_mtx); + pr->pr_ref++; + mtx_unlock(&pr->pr_mtx); +} + +u_int32_t +prison_getip(struct ucred *cred) +{ + + return (cred->cr_prison->pr_ip); +} + +int +prison_ip(struct ucred *cred, int flag, u_int32_t *ip) +{ + u_int32_t tmp; + + if (!jailed(cred)) + return (0); + if (flag) + tmp = *ip; + else + tmp = ntohl(*ip); + if (tmp == INADDR_ANY) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return (0); + } + if (tmp == INADDR_LOOPBACK) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return (0); + } + if (cred->cr_prison->pr_ip != tmp) + return (1); + return (0); +} + +void +prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip) +{ + u_int32_t tmp; + + if (!jailed(cred)) + return; + if (flag) + tmp = *ip; + else + tmp = ntohl(*ip); + if (tmp == INADDR_LOOPBACK) { + if (flag) + *ip = cred->cr_prison->pr_ip; + else + *ip = htonl(cred->cr_prison->pr_ip); + return; + } + return; +} + +int +prison_if(struct ucred *cred, struct sockaddr *sa) +{ + struct sockaddr_in *sai = (struct sockaddr_in*) sa; + int ok; + + if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only) + ok = 1; + else if (sai->sin_family != AF_INET) + ok = 0; + else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr)) + ok = 1; + else + ok = 0; + return (ok); +} + +/* + * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. + */ +int +prison_check(cred1, cred2) + struct ucred *cred1, *cred2; +{ + + if (jailed(cred1)) { + if (!jailed(cred2)) + return (ESRCH); + if (cred2->cr_prison != cred1->cr_prison) + return (ESRCH); + } + + return (0); +} + +/* + * Return 1 if the passed credential is in a jail, otherwise 0. + */ +int +jailed(cred) + struct ucred *cred; +{ + + return (cred->cr_prison != NULL); +} + +/* + * Return the correct hostname for the passed credential. + */ +void +getcredhostname(cred, buf, size) + struct ucred *cred; + char *buf; + size_t size; +{ + + if (jailed(cred)) { + mtx_lock(&cred->cr_prison->pr_mtx); + strncpy(buf, cred->cr_prison->pr_host, size); + mtx_unlock(&cred->cr_prison->pr_mtx); + } + else + strncpy(buf, hostname, size); + buf[size - 1] = '\0'; +} diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c new file mode 100644 index 0000000..a456a86 --- /dev/null +++ b/sys/kern/kern_kthread.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/wait.h> + +#include <machine/stdarg.h> + +/* + * Start a kernel process. This is called after a fork() call in + * mi_startup() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons and intended + * to be called from SYSINIT(). + */ +void +kproc_start(udata) + const void *udata; +{ + const struct kproc_desc *kp = udata; + int error; + + error = kthread_create((void (*)(void *))kp->func, NULL, + kp->global_procpp, 0, "%s", kp->arg0); + if (error) + panic("kproc_start: %s: error %d", kp->arg0, error); +} + +/* + * Create a kernel process/thread/whatever. It shares its address space + * with proc0 - ie: kernel only. + * + * func is the function to start. + * arg is the parameter to pass to function on first startup. + * newpp is the return value pointing to the thread's struct proc. + * flags are flags to fork1 (in unistd.h) + * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). + */ +int +kthread_create(void (*func)(void *), void *arg, + struct proc **newpp, int flags, const char *fmt, ...) +{ + int error; + va_list ap; + struct proc *p2; + + if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */) + panic("kthread_create called too soon"); + + error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags, + &p2); + if (error) + return error; + + /* save a global descriptor, if desired */ + if (newpp != NULL) + *newpp = p2; + + /* this is a non-swapped system process */ + PROC_LOCK(p2); + p2->p_flag |= P_SYSTEM | P_KTHREAD; + p2->p_procsig->ps_flag |= PS_NOCLDWAIT; + _PHOLD(p2); + PROC_UNLOCK(p2); + + /* set up arg0 for 'ps', et al */ + va_start(ap, fmt); + vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap); + va_end(ap); + + /* call the processes' main()... */ + cpu_set_fork_handler(FIRST_THREAD_IN_PROC(p2), func, arg); + + /* Delay putting it on the run queue until now. 
*/ + mtx_lock_spin(&sched_lock); + p2->p_sflag |= PS_INMEM; + if (!(flags & RFSTOPPED)) { + p2->p_stat = SRUN; + setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */ + } + mtx_unlock_spin(&sched_lock); + + return 0; +} + +void +kthread_exit(int ecode) +{ + struct thread *td; + struct proc *p; + + td = curthread; + p = td->td_proc; + sx_xlock(&proctree_lock); + PROC_LOCK(p); + proc_reparent(p, initproc); + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + exit1(td, W_EXITCODE(ecode, 0)); +} + +/* + * Advise a kernel process to suspend (or resume) in its main loop. + * Participation is voluntary. + */ +int +kthread_suspend(struct proc *p, int timo) +{ + /* + * Make sure this is indeed a system process and we can safely + * use the p_siglist field. + */ + PROC_LOCK(p); + if ((p->p_flag & P_KTHREAD) == 0) { + PROC_UNLOCK(p); + return (EINVAL); + } + SIGADDSET(p->p_siglist, SIGSTOP); + wakeup(p); + return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo); +} + +int +kthread_resume(struct proc *p) +{ + /* + * Make sure this is indeed a system process and we can safely + * use the p_siglist field. + */ + PROC_LOCK(p); + if ((p->p_flag & P_KTHREAD) == 0) { + PROC_UNLOCK(p); + return (EINVAL); + } + SIGDELSET(p->p_siglist, SIGSTOP); + PROC_UNLOCK(p); + wakeup(&p->p_siglist); + return (0); +} + +void +kthread_suspend_check(struct proc *p) +{ + PROC_LOCK(p); + while (SIGISMEMBER(p->p_siglist, SIGSTOP)) { + wakeup(&p->p_siglist); + msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "ktsusp", 0); + } + PROC_UNLOCK(p); +} diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c new file mode 100644 index 0000000..719d5e4 --- /dev/null +++ b/sys/kern/kern_ktr.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2000 + * John Baldwin <jhb@FreeBSD.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This module holds the global variables used by KTR and the ktr_tracepoint() + * function that does the actual tracing. 
+ */ + +#include "opt_ddb.h" +#include "opt_ktr.h" + +#include <sys/param.h> +#include <sys/cons.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/libkern.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/time.h> + +#include <machine/cpu.h> +#ifdef __sparc64__ +#include <machine/ktr.h> +#endif + +#include <ddb/ddb.h> + +#ifndef KTR_ENTRIES +#define KTR_ENTRIES 1024 +#endif + +#ifndef KTR_MASK +#define KTR_MASK (KTR_GEN) +#endif + +#ifndef KTR_CPUMASK +#define KTR_CPUMASK (~0) +#endif + +#ifndef KTR_TIME +#define KTR_TIME get_cyclecount() +#endif + +#ifndef KTR_CPU +#define KTR_CPU PCPU_GET(cpuid) +#endif + +SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options"); + +int ktr_cpumask = KTR_CPUMASK; +TUNABLE_INT("debug.ktr.cpumask", &ktr_cpumask); +SYSCTL_INT(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RW, &ktr_cpumask, 0, ""); + +int ktr_mask = KTR_MASK; +TUNABLE_INT("debug.ktr.mask", &ktr_mask); +SYSCTL_INT(_debug_ktr, OID_AUTO, mask, CTLFLAG_RW, &ktr_mask, 0, ""); + +int ktr_entries = KTR_ENTRIES; +SYSCTL_INT(_debug_ktr, OID_AUTO, entries, CTLFLAG_RD, &ktr_entries, 0, ""); + +int ktr_version = KTR_VERSION; +SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, ""); + +volatile int ktr_idx = 0; +struct ktr_entry ktr_buf[KTR_ENTRIES]; + +#ifdef KTR_VERBOSE +int ktr_verbose = KTR_VERBOSE; +TUNABLE_INT("debug.ktr.verbose", &ktr_verbose); +SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, ""); +#endif + +void +ktr_tracepoint(u_int mask, const char *file, int line, const char *format, + u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5, + u_long arg6) +{ + struct ktr_entry *entry; + int newindex, saveindex; +#ifdef KTR_VERBOSE + struct thread *td; +#endif + int cpu; + + if (panicstr) + return; + if ((ktr_mask & mask) == 0) + return; + cpu = KTR_CPU; + if (((1 << cpu) & ktr_cpumask) == 0) + return; +#ifdef KTR_VERBOSE + td = curthread; + if (td->td_inktr) + return; + td->td_inktr++; +#endif + do { + saveindex = ktr_idx; + newindex = (saveindex + 1) & (KTR_ENTRIES - 1); + } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0); + entry = &ktr_buf[saveindex]; + entry->ktr_timestamp = KTR_TIME; + entry->ktr_cpu = cpu; + entry->ktr_file = file; + entry->ktr_line = line; +#ifdef KTR_VERBOSE + if (ktr_verbose) { +#ifdef SMP + printf("cpu%d ", cpu); +#endif + if (ktr_verbose > 1) { + printf("%s.%d\t", entry->ktr_file, + entry->ktr_line); + } + printf(format, arg1, arg2, arg3, arg4, arg5, arg6); + printf("\n"); + } +#endif + entry->ktr_desc = format; + entry->ktr_parms[0] = arg1; + entry->ktr_parms[1] = arg2; + entry->ktr_parms[2] = arg3; + entry->ktr_parms[3] = arg4; + entry->ktr_parms[4] = arg5; + entry->ktr_parms[5] = arg6; +#ifdef KTR_VERBOSE + td->td_inktr--; +#endif +} + +#ifdef DDB + +struct tstate { + int cur; + int first; +}; +static struct tstate tstate; +static int db_ktr_verbose; +static int db_mach_vtrace(void); + +#define NUM_LINES_PER_PAGE 18 + +DB_SHOW_COMMAND(ktr, db_ktr_all) +{ + int c, lines; + + lines = NUM_LINES_PER_PAGE; + tstate.cur = (ktr_idx - 1) & (KTR_ENTRIES - 1); + tstate.first = -1; + if (strcmp(modif, "v") == 0) + db_ktr_verbose = 1; + else + db_ktr_verbose = 0; + while (db_mach_vtrace()) + if (--lines == 0) { + db_printf("--More--"); + c = cngetc(); + db_printf("\r"); + switch (c) { + case '\n': /* one more line */ + lines = 1; + break; + case ' ': /* one more page */ + lines = NUM_LINES_PER_PAGE; + break; + default: + db_printf("\n"); + return; + } + 
} +} + +static int +db_mach_vtrace(void) +{ + struct ktr_entry *kp; + + if (tstate.cur == tstate.first) { + db_printf("--- End of trace buffer ---\n"); + return (0); + } + kp = &ktr_buf[tstate.cur]; + + /* Skip over unused entries. */ + if (kp->ktr_desc == NULL) { + db_printf("--- End of trace buffer ---\n"); + return (0); + } + db_printf("%d: ", tstate.cur); +#ifdef SMP + db_printf("cpu%d ", kp->ktr_cpu); +#endif + if (db_ktr_verbose) { + db_printf("%10.10lld %s.%d\t", (long long)kp->ktr_timestamp, + kp->ktr_file, kp->ktr_line); + } + db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1], + kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4], + kp->ktr_parms[5]); + db_printf("\n"); + + if (tstate.first == -1) + tstate.first = tstate.cur; + + if (--tstate.cur < 0) + tstate.cur = KTR_ENTRIES - 1; + + return (1); +} + +#endif /* DDB */ diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c new file mode 100644 index 0000000..b71f695 --- /dev/null +++ b/sys/kern/kern_ktrace.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/ktrace.h> +#include <sys/sema.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> + +static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); + +#ifdef KTRACE + +#ifndef KTRACE_REQUEST_POOL +#define KTRACE_REQUEST_POOL 100 +#endif + +struct ktr_request { + struct ktr_header ktr_header; + struct ucred *ktr_cred; + struct vnode *ktr_vp; + union { + struct ktr_syscall ktr_syscall; + struct ktr_sysret ktr_sysret; + struct ktr_genio ktr_genio; + struct ktr_psig ktr_psig; + struct ktr_csw ktr_csw; + } ktr_data; + int ktr_synchronous; + STAILQ_ENTRY(ktr_request) ktr_list; +}; + +static int data_lengths[] = { + 0, /* none */ + offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */ + sizeof(struct ktr_sysret), /* KTR_SYSRET */ + 0, /* KTR_NAMEI */ + sizeof(struct ktr_genio), /* KTR_GENIO */ + sizeof(struct ktr_psig), /* KTR_PSIG */ + sizeof(struct ktr_csw), /* KTR_CSW */ + 0 /* KTR_USER */ +}; + +static STAILQ_HEAD(, ktr_request) ktr_todo; +static STAILQ_HEAD(, ktr_request) ktr_free; + +static uint ktr_requestpool = KTRACE_REQUEST_POOL; +TUNABLE_INT("kern.ktrace_request_pool", &ktr_requestpool); + +static int print_message = 1; +struct mtx ktrace_mtx; +static struct sema ktrace_sema; + +static void ktrace_init(void *dummy); +static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); +static uint ktrace_resize_pool(uint newsize); +static struct ktr_request *ktr_getrequest(int type); +static void ktr_submitrequest(struct ktr_request *req); +static void ktr_freerequest(struct ktr_request *req); +static void ktr_loop(void *dummy); +static void ktr_writerequest(struct ktr_request *req); +static int ktrcanset(struct thread *,struct proc *); +static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); +static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); + +static void +ktrace_init(void *dummy) +{ + struct ktr_request *req; + int i; + + mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); + sema_init(&ktrace_sema, 0, "ktrace"); + STAILQ_INIT(&ktr_todo); + STAILQ_INIT(&ktr_free); + for (i = 0; i < ktr_requestpool; i++) { + req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + } + kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, "ktrace"); +} +SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); + +static int +sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) +{ + struct thread *td; + uint newsize, oldsize, wantsize; + int error; + + /* Handle easy read-only case first to avoid warnings from GCC. 
*/ + if (!req->newptr) { + mtx_lock(&ktrace_mtx); + oldsize = ktr_requestpool; + mtx_unlock(&ktrace_mtx); + return (SYSCTL_OUT(req, &oldsize, sizeof(uint))); + } + + error = SYSCTL_IN(req, &wantsize, sizeof(uint)); + if (error) + return (error); + td = curthread; + td->td_inktrace = 1; + mtx_lock(&ktrace_mtx); + oldsize = ktr_requestpool; + newsize = ktrace_resize_pool(wantsize); + mtx_unlock(&ktrace_mtx); + td->td_inktrace = 0; + error = SYSCTL_OUT(req, &oldsize, sizeof(uint)); + if (error) + return (error); + if (newsize != wantsize) + return (ENOSPC); + return (0); +} +SYSCTL_PROC(_kern, OID_AUTO, ktrace_request_pool, CTLTYPE_UINT|CTLFLAG_RW, + &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", ""); + +static uint +ktrace_resize_pool(uint newsize) +{ + struct ktr_request *req; + + mtx_assert(&ktrace_mtx, MA_OWNED); + print_message = 1; + if (newsize == ktr_requestpool) + return (newsize); + if (newsize < ktr_requestpool) + /* Shrink pool down to newsize if possible. */ + while (ktr_requestpool > newsize) { + req = STAILQ_FIRST(&ktr_free); + if (req == NULL) + return (ktr_requestpool); + STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); + ktr_requestpool--; + mtx_unlock(&ktrace_mtx); + free(req, M_KTRACE); + mtx_lock(&ktrace_mtx); + } + else + /* Grow pool up to newsize. */ + while (ktr_requestpool < newsize) { + mtx_unlock(&ktrace_mtx); + req = malloc(sizeof(struct ktr_request), M_KTRACE, + M_WAITOK); + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + ktr_requestpool++; + } + return (ktr_requestpool); +} + +static struct ktr_request * +ktr_getrequest(int type) +{ + struct ktr_request *req; + struct thread *td = curthread; + struct proc *p = td->td_proc; + int pm; + + td->td_inktrace = 1; + mtx_lock(&ktrace_mtx); + if (!KTRCHECK(td, type)) { + mtx_unlock(&ktrace_mtx); + td->td_inktrace = 0; + return (NULL); + } + req = STAILQ_FIRST(&ktr_free); + if (req != NULL) { + STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); + req->ktr_header.ktr_type = type; + KASSERT(p->p_tracep != NULL, ("ktrace: no trace vnode")); + req->ktr_vp = p->p_tracep; + VREF(p->p_tracep); + mtx_unlock(&ktrace_mtx); + microtime(&req->ktr_header.ktr_time); + req->ktr_header.ktr_pid = p->p_pid; + bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1); + req->ktr_cred = crhold(td->td_ucred); + req->ktr_header.ktr_buffer = NULL; + req->ktr_header.ktr_len = 0; + req->ktr_synchronous = 0; + } else { + pm = print_message; + print_message = 0; + mtx_unlock(&ktrace_mtx); + if (pm) + printf("Out of ktrace request objects.\n"); + td->td_inktrace = 0; + } + return (req); +} + +static void +ktr_submitrequest(struct ktr_request *req) +{ + + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list); + sema_post(&ktrace_sema); + if (req->ktr_synchronous) { + /* + * For a synchronous request, we wait for the ktrace thread + * to get to our item in the todo list and wake us up. Then + * we write the request out ourselves and wake the ktrace + * thread back up. 
+ */ + msleep(req, &ktrace_mtx, curthread->td_priority, "ktrsync", 0); + mtx_unlock(&ktrace_mtx); + ktr_writerequest(req); + mtx_lock(&ktrace_mtx); + wakeup(req); + } + mtx_unlock(&ktrace_mtx); + curthread->td_inktrace = 0; +} + +static void +ktr_freerequest(struct ktr_request *req) +{ + + crfree(req->ktr_cred); + mtx_lock(&Giant); + vrele(req->ktr_vp); + mtx_unlock(&Giant); + mtx_lock(&ktrace_mtx); + STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); + mtx_unlock(&ktrace_mtx); +} + +static void +ktr_loop(void *dummy) +{ + struct ktr_request *req; + struct thread *td; + struct ucred *cred; + + /* Only cache these values once. */ + td = curthread; + cred = td->td_ucred; + for (;;) { + sema_wait(&ktrace_sema); + mtx_lock(&ktrace_mtx); + req = STAILQ_FIRST(&ktr_todo); + STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list); + KASSERT(req != NULL, ("got a NULL request")); + if (req->ktr_synchronous) { + wakeup(req); + msleep(req, &ktrace_mtx, curthread->td_priority, + "ktrwait", 0); + mtx_unlock(&ktrace_mtx); + } else { + mtx_unlock(&ktrace_mtx); + /* + * It is not enough just to pass the cached cred + * to the VOP's in ktr_writerequest(). Some VFS + * operations use curthread->td_ucred, so we need + * to modify our thread's credentials as well. + * Evil. + */ + td->td_ucred = req->ktr_cred; + ktr_writerequest(req); + td->td_ucred = cred; + } + ktr_freerequest(req); + } +} + +/* + * MPSAFE + */ +void +ktrsyscall(code, narg, args) + int code, narg; + register_t args[]; +{ + struct ktr_request *req; + struct ktr_syscall *ktp; + size_t buflen; + + req = ktr_getrequest(KTR_SYSCALL); + if (req == NULL) + return; + ktp = &req->ktr_data.ktr_syscall; + ktp->ktr_code = code; + ktp->ktr_narg = narg; + buflen = sizeof(register_t) * narg; + if (buflen > 0) { + req->ktr_header.ktr_buffer = malloc(buflen, M_KTRACE, M_WAITOK); + bcopy(args, req->ktr_header.ktr_buffer, buflen); + req->ktr_header.ktr_len = buflen; + } + ktr_submitrequest(req); +} + +/* + * MPSAFE + */ +void +ktrsysret(code, error, retval) + int code, error; + register_t retval; +{ + struct ktr_request *req; + struct ktr_sysret *ktp; + + req = ktr_getrequest(KTR_SYSRET); + if (req == NULL) + return; + ktp = &req->ktr_data.ktr_sysret; + ktp->ktr_code = code; + ktp->ktr_error = error; + ktp->ktr_retval = retval; /* what about val2 ? */ + ktr_submitrequest(req); +} + +void +ktrnamei(path) + char *path; +{ + struct ktr_request *req; + int namelen; + + req = ktr_getrequest(KTR_NAMEI); + if (req == NULL) + return; + namelen = strlen(path); + if (namelen > 0) { + req->ktr_header.ktr_len = namelen; + req->ktr_header.ktr_buffer = malloc(namelen, M_KTRACE, + M_WAITOK); + bcopy(path, req->ktr_header.ktr_buffer, namelen); + } + ktr_submitrequest(req); +} + +/* + * Since the uio may not stay valid, we can not hand off this request to + * the thread and need to process it synchronously. However, we wish to + * keep the relative order of records in a trace file correct, so we + * do put this request on the queue (if it isn't empty) and then block. + * The ktrace thread waks us back up when it is time for this event to + * be posted and blocks until we have completed writing out the event + * and woken it back up. 
+ */ +void +ktrgenio(fd, rw, uio, error) + int fd; + enum uio_rw rw; + struct uio *uio; + int error; +{ + struct ktr_request *req; + struct ktr_genio *ktg; + + if (error) + return; + req = ktr_getrequest(KTR_GENIO); + if (req == NULL) + return; + ktg = &req->ktr_data.ktr_genio; + ktg->ktr_fd = fd; + ktg->ktr_rw = rw; + req->ktr_header.ktr_buffer = uio; + uio->uio_offset = 0; + uio->uio_rw = UIO_WRITE; + req->ktr_synchronous = 1; + ktr_submitrequest(req); +} + +void +ktrpsig(sig, action, mask, code) + int sig; + sig_t action; + sigset_t *mask; + int code; +{ + struct ktr_request *req; + struct ktr_psig *kp; + + req = ktr_getrequest(KTR_PSIG); + if (req == NULL) + return; + kp = &req->ktr_data.ktr_psig; + kp->signo = (char)sig; + kp->action = action; + kp->mask = *mask; + kp->code = code; + ktr_submitrequest(req); +} + +void +ktrcsw(out, user) + int out, user; +{ + struct ktr_request *req; + struct ktr_csw *kc; + + req = ktr_getrequest(KTR_CSW); + if (req == NULL) + return; + kc = &req->ktr_data.ktr_csw; + kc->out = out; + kc->user = user; + ktr_submitrequest(req); +} +#endif + +/* Interface and common routines */ + +/* + * ktrace system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif +/* ARGSUSED */ +int +ktrace(td, uap) + struct thread *td; + register struct ktrace_args *uap; +{ +#ifdef KTRACE + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; + int ret = 0; + int flags, error = 0; + struct nameidata nd; + + td->td_inktrace = 1; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. + */ + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td); + flags = FREAD | FWRITE | O_NOFOLLOW; + error = vn_open(&nd, &flags, 0); + if (error) { + td->td_inktrace = 0; + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + VOP_UNLOCK(vp, 0, td); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + td->td_inktrace = 0; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile. + */ + if (ops == KTROP_CLEARFILE) { + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_tracep == vp) { + if (ktrcanset(td, p)) { + mtx_lock(&ktrace_mtx); + p->p_tracep = NULL; + p->p_traceflag = 0; + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + (void) vn_close(vp, FREAD|FWRITE, + td->td_ucred, td); + } else { + PROC_UNLOCK(p); + error = EPERM; + } + } else + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) + */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (uap->pid < 0) { + /* + * by process group + */ + sx_slock(&proctree_lock); + pg = pgfind(-uap->pid); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + error = ESRCH; + goto done; + } + /* + * ktrops() may call vrele(). Lock pg_members + * by the proctree_lock rather than pg_mtx. 
+ */ + PGRP_UNLOCK(pg); + LIST_FOREACH(p, &pg->pg_members, p_pglist) + if (descend) + ret |= ktrsetchildren(td, p, ops, facs, vp); + else + ret |= ktrops(td, p, ops, facs, vp); + sx_sunlock(&proctree_lock); + } else { + /* + * by pid + */ + p = pfind(uap->pid); + if (p == NULL) { + error = ESRCH; + goto done; + } + PROC_UNLOCK(p); + /* XXX: UNLOCK above has a race */ + if (descend) + ret |= ktrsetchildren(td, p, ops, facs, vp); + else + ret |= ktrops(td, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, td->td_ucred, td); + td->td_inktrace = 0; + return (error); +#else + return ENOSYS; +#endif +} + +/* + * utrace system call + */ +/* ARGSUSED */ +int +utrace(td, uap) + struct thread *td; + register struct utrace_args *uap; +{ + +#ifdef KTRACE + struct ktr_request *req; + register caddr_t cp; + + if (uap->len > KTR_USER_MAXLEN) + return (EINVAL); + req = ktr_getrequest(KTR_USER); + if (req == NULL) + return (0); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + req->ktr_header.ktr_buffer = cp; + req->ktr_header.ktr_len = uap->len; + ktr_submitrequest(req); + } else { + ktr_freerequest(req); + td->td_inktrace = 0; + } + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int +ktrops(td, p, ops, facs, vp) + struct thread *td; + struct proc *p; + int ops, facs; + struct vnode *vp; +{ + struct vnode *tracevp = NULL; + + PROC_LOCK(p); + if (!ktrcanset(td, p)) { + PROC_UNLOCK(p); + return (0); + } + mtx_lock(&ktrace_mtx); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish below + */ + tracevp = p->p_tracep; + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (td->td_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + tracevp = p->p_tracep; + p->p_tracep = NULL; + } + } + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + if (tracevp != NULL) + vrele(tracevp); + + return (1); +} + +static int +ktrsetchildren(td, top, ops, facs, vp) + struct thread *td; + struct proc *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + sx_slock(&proctree_lock); + for (;;) { + ret |= ktrops(td, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). + */ + if (!LIST_EMPTY(&p->p_children)) + p = LIST_FIRST(&p->p_children); + else for (;;) { + if (p == top) { + sx_sunlock(&proctree_lock); + return (ret); + } + if (LIST_NEXT(p, p_sibling)) { + p = LIST_NEXT(p, p_sibling); + break; + } + p = p->p_pptr; + } + } + /*NOTREACHED*/ +} + +static void +ktr_writerequest(struct ktr_request *req) +{ + struct ktr_header *kth; + struct vnode *vp; + struct uio *uio = NULL; + struct proc *p; + struct thread *td; + struct ucred *cred; + struct uio auio; + struct iovec aiov[3]; + struct mount *mp; + int datalen, buflen, vrele_count; + int error; + + vp = req->ktr_vp; + /* + * If vp is NULL, the vp has been cleared out from under this + * request, so just drop it. 
+ */ + if (vp == NULL) + return; + kth = &req->ktr_header; + datalen = data_lengths[kth->ktr_type]; + buflen = kth->ktr_len; + cred = req->ktr_cred; + td = curthread; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_td = td; + if (datalen != 0) { + aiov[1].iov_base = (caddr_t)&req->ktr_data; + aiov[1].iov_len = datalen; + auio.uio_resid += datalen; + auio.uio_iovcnt++; + kth->ktr_len += datalen; + } + if (buflen != 0) { + KASSERT(kth->ktr_buffer != NULL, ("ktrace: nothing to write")); + aiov[auio.uio_iovcnt].iov_base = kth->ktr_buffer; + aiov[auio.uio_iovcnt].iov_len = buflen; + auio.uio_resid += buflen; + auio.uio_iovcnt++; + } else + uio = kth->ktr_buffer; + KASSERT((uio == NULL) ^ (kth->ktr_type == KTR_GENIO), + ("ktrace: uio and genio mismatch")); + if (uio != NULL) + kth->ktr_len += uio->uio_resid; + mtx_lock(&Giant); + vn_start_write(vp, &mp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); + error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); + if (error == 0 && uio != NULL) { + (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); + error = VOP_WRITE(vp, uio, IO_UNIT | IO_APPEND, cred); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + mtx_unlock(&Giant); + if (buflen != 0) + free(kth->ktr_buffer, M_KTRACE); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. We defer + * all the vrele()'s on the vnode until after we are finished walking + * the various lists to avoid needlessly holding locks. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + vrele_count = 0; + /* + * First, clear this vnode from being used by any processes in the + * system. + * XXX - If one process gets an EPERM writing to the vnode, should + * we really do this? Other processes might have suitable + * credentials for the operation. + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_tracep == vp) { + mtx_lock(&ktrace_mtx); + p->p_tracep = NULL; + p->p_traceflag = 0; + mtx_unlock(&ktrace_mtx); + vrele_count++; + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + /* + * Second, clear this vnode from any pending requests. + */ + mtx_lock(&ktrace_mtx); + STAILQ_FOREACH(req, &ktr_todo, ktr_list) { + if (req->ktr_vp == vp) { + req->ktr_vp = NULL; + vrele_count++; + } + } + mtx_unlock(&ktrace_mtx); + mtx_lock(&Giant); + while (vrele_count-- > 0) + vrele(vp); + mtx_unlock(&Giant); +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + */ +static int +ktrcanset(td, targetp) + struct thread *td; + struct proc *targetp; +{ + + PROC_LOCK_ASSERT(targetp, MA_OWNED); + if (targetp->p_traceflag & KTRFAC_ROOT && + suser_cred(td->td_ucred, PRISON_ROOT)) + return (0); + + if (p_candebug(td, targetp) != 0) + return (0); + + return (1); +} + +#endif /* KTRACE */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c new file mode 100644 index 0000000..a506726 --- /dev/null +++ b/sys/kern/kern_linker.c @@ -0,0 +1,1812 @@ +/*- + * Copyright (c) 1997-2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/sysctl.h> + +#include "linker_if.h" + +#ifdef KLD_DEBUG +int kld_debug = 0; +#endif + +/* + * static char *linker_search_path(const char *name, struct mod_depend + * *verinfo); + */ +static const char *linker_basename(const char *path); +static int linker_load_module(const char *kldname, const char *modname, + struct linker_file *parent, struct mod_depend *verinfo, + struct linker_file **lfpp); + +/* Metadata from the static kernel */ +SET_DECLARE(modmetadata_set, struct mod_metadata); + +MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); + +linker_file_t linker_kernel_file; + +static struct mtx kld_mtx; /* kernel linker mutex */ + +static linker_class_list_t classes; +static linker_file_list_t linker_files; +static int next_file_id = 1; +static int linker_no_more_classes = 0; + +#define LINKER_GET_NEXT_FILE_ID(a) do { \ + linker_file_t lftmp; \ + \ +retry: \ + mtx_lock(&kld_mtx); \ + TAILQ_FOREACH(lftmp, &linker_files, link) { \ + if (next_file_id == lftmp->id) { \ + next_file_id++; \ + mtx_unlock(&kld_mtx); \ + goto retry; \ + } \ + } \ + (a) = next_file_id; \ + mtx_unlock(&kld_mtx); /* Hold for safe read of id variable */ \ +} while(0) + + +/* XXX wrong name; we're looking at version provision tags here, not modules */ +typedef TAILQ_HEAD(, modlist) modlisthead_t; +struct modlist { + TAILQ_ENTRY(modlist) link; /* chain together all modules */ + linker_file_t container; + const char *name; + int version; +}; +typedef struct modlist *modlist_t; +static modlisthead_t found_modules; + +static modlist_t modlist_lookup2(const char *name, + struct mod_depend *verinfo); + +static char * +linker_strdup(const char *str) +{ + char *result; + + if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL) + strcpy(result, str); + return (result); +} + +static void 
+linker_init(void *arg) +{ + + mtx_init(&kld_mtx, "kernel linker", NULL, MTX_DEF); + TAILQ_INIT(&classes); + TAILQ_INIT(&linker_files); +} + +SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0) + +static void +linker_stop_class_add(void *arg) +{ + + linker_no_more_classes = 1; +} + +SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL) + +int +linker_add_class(linker_class_t lc) +{ + + /* + * We disallow any class registration passt SI_ORDER_ANY + * of SI_SUB_KLD. + */ + if (linker_no_more_classes == 1) + return (EPERM); + kobj_class_compile((kobj_class_t) lc); + TAILQ_INSERT_TAIL(&classes, lc, link); + return (0); +} + +static void +linker_file_sysinit(linker_file_t lf) +{ + struct sysinit **start, **stop, **sipp, **xipp, *save; + + KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) + return; + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the operation + * which ensures continued function. + */ + for (sipp = start; sipp < stop; sipp++) { + for (xipp = sipp + 1; xipp < stop; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip */ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + */ + for (sipp = start; sipp < stop; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s) */ + + /* Call function */ + (*((*sipp)->func)) ((*sipp)->udata); + } +} + +static void +linker_file_sysuninit(linker_file_t lf) +{ + struct sysinit **start, **stop, **sipp, **xipp, *save; + + KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, + NULL) != 0) + return; + + /* + * Perform a reverse bubble sort of the system initialization objects + * by their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the operation + * which ensures continued function. + */ + for (sipp = start; sipp < stop; sipp++) { + for (xipp = sipp + 1; xipp < stop; xipp++) { + if ((*sipp)->subsystem > (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order >= (*xipp)->order)) + continue; /* skip */ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. 
+ */ + for (sipp = start; sipp < stop; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s) */ + + /* Call function */ + (*((*sipp)->func)) ((*sipp)->udata); + } +} + +static void +linker_file_register_sysctls(linker_file_t lf) +{ + struct sysctl_oid **start, **stop, **oidp; + + KLD_DPF(FILE, + ("linker_file_register_sysctls: registering SYSCTLs for %s\n", + lf->filename)); + + if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) + return; + + for (oidp = start; oidp < stop; oidp++) + sysctl_register_oid(*oidp); +} + +static void +linker_file_unregister_sysctls(linker_file_t lf) +{ + struct sysctl_oid **start, **stop, **oidp; + + KLD_DPF(FILE, ("linker_file_unregister_sysctls: registering SYSCTLs" + " for %s\n", lf->filename)); + + if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) + return; + + for (oidp = start; oidp < stop; oidp++) + sysctl_unregister_oid(*oidp); +} + +static int +linker_file_register_modules(linker_file_t lf) +{ + struct mod_metadata **start, **stop, **mdp; + const moduledata_t *moddata; + int error; + + KLD_DPF(FILE, ("linker_file_register_modules: registering modules" + " in %s\n", lf->filename)); + + if (linker_file_lookup_set(lf, "modmetadata_set", &start, + &stop, 0) != 0) { + /* + * This fallback should be unnecessary, but if we get booted + * from boot2 instead of loader and we are missing our + * metadata then we have to try the best we can. + */ + if (lf == linker_kernel_file) { + start = SET_BEGIN(modmetadata_set); + stop = SET_LIMIT(modmetadata_set); + } else + return (0); + } + for (mdp = start; mdp < stop; mdp++) { + if ((*mdp)->md_type != MDT_MODULE) + continue; + moddata = (*mdp)->md_data; + KLD_DPF(FILE, ("Registering module %s in %s\n", + moddata->name, lf->filename)); + error = module_register(moddata, lf); + if (error) + printf("Module %s failed to register: %d\n", + moddata->name, error); + } + return (0); +} + +static void +linker_init_kernel_modules(void) +{ + + linker_file_register_modules(linker_kernel_file); +} + +SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0) + +int +linker_load_file(const char *filename, linker_file_t *result) +{ + linker_class_t lc; + linker_file_t lf; + int foundfile, error = 0; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) + return (EPERM); + + lf = linker_find_file_by_name(filename); + if (lf) { + KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," + " incrementing refs\n", filename)); + *result = lf; + lf->refs++; + goto out; + } + lf = NULL; + foundfile = 0; + + /* + * We do not need to protect (lock) classes here because there is + * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) + * and there is no class deregistration mechanism at this time. + */ + TAILQ_FOREACH(lc, &classes, link) { + KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", + filename)); + error = LINKER_LOAD_FILE(lc, filename, &lf); + /* + * If we got something other than ENOENT, then it exists but + * we cannot load it for some other reason. + */ + if (error != ENOENT) + foundfile = 1; + if (lf) { + linker_file_register_modules(lf); + linker_file_register_sysctls(lf); + linker_file_sysinit(lf); + lf->flags |= LINKER_FILE_LINKED; + *result = lf; + error = 0; + goto out; + } + } + /* + * Less than ideal, but tells the user whether it failed to load or + * the module was not found. + */ + if (foundfile) + /* Format not recognized (or unloadable). 
*/ + error = ENOEXEC; + else + error = ENOENT; /* Nothing found */ +out: + return (error); +} + +int +linker_reference_module(const char *modname, struct mod_depend *verinfo, + linker_file_t *result) +{ + modlist_t mod; + + if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { + *result = mod->container; + (*result)->refs++; + return (0); + } + + return (linker_load_module(NULL, modname, NULL, verinfo, result)); +} + +linker_file_t +linker_find_file_by_name(const char *filename) +{ + linker_file_t lf = 0; + char *koname; + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) + goto out; + sprintf(koname, "%s.ko", filename); + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + if (strcmp(lf->filename, koname) == 0) + break; + if (strcmp(lf->filename, filename) == 0) + break; + } + mtx_unlock(&kld_mtx); +out: + if (koname) + free(koname, M_LINKER); + return (lf); +} + +linker_file_t +linker_find_file_by_id(int fileid) +{ + linker_file_t lf = 0; + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) + if (lf->id == fileid) + break; + mtx_unlock(&kld_mtx); + return (lf); +} + +linker_file_t +linker_make_file(const char *pathname, linker_class_t lc) +{ + linker_file_t lf; + const char *filename; + + lf = NULL; + filename = linker_basename(pathname); + + KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename)); + lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); + if (lf == NULL) + goto out; + lf->refs = 1; + lf->userrefs = 0; + lf->flags = 0; + lf->filename = linker_strdup(filename); + LINKER_GET_NEXT_FILE_ID(lf->id); + lf->ndeps = 0; + lf->deps = NULL; + STAILQ_INIT(&lf->common); + TAILQ_INIT(&lf->modules); + mtx_lock(&kld_mtx); + TAILQ_INSERT_TAIL(&linker_files, lf, link); + mtx_unlock(&kld_mtx); +out: + return (lf); +} + +int +linker_file_unload(linker_file_t file) +{ + module_t mod, next; + modlist_t ml, nextml; + struct common_symbol *cp; + int error, i; + + error = 0; + + /* Refuse to unload modules if securelevel raised. */ + if (securelevel > 0) + return (EPERM); + + KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); + if (file->refs == 1) { + KLD_DPF(FILE, ("linker_file_unload: file is unloading," + " informing modules\n")); + + /* + * Inform any modules associated with this file. + */ + MOD_XLOCK; + for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { + next = module_getfnext(mod); + MOD_XUNLOCK; + + /* + * Give the module a chance to veto the unload. + */ + if ((error = module_unload(mod)) != 0) { + KLD_DPF(FILE, ("linker_file_unload: module %x" + " vetoes unload\n", mod)); + goto out; + } else + MOD_XLOCK; + module_release(mod); + } + MOD_XUNLOCK; + } + file->refs--; + if (file->refs > 0) { + goto out; + } + for (ml = TAILQ_FIRST(&found_modules); ml; ml = nextml) { + nextml = TAILQ_NEXT(ml, link); + if (ml->container == file) + TAILQ_REMOVE(&found_modules, ml, link); + } + + /* + * Don't try to run SYSUNINITs if we are unloaded due to a + * link error. 
+ */ + if (file->flags & LINKER_FILE_LINKED) { + linker_file_sysuninit(file); + linker_file_unregister_sysctls(file); + } + mtx_lock(&kld_mtx); + TAILQ_REMOVE(&linker_files, file, link); + mtx_unlock(&kld_mtx); + + if (file->deps) { + for (i = 0; i < file->ndeps; i++) + linker_file_unload(file->deps[i]); + free(file->deps, M_LINKER); + file->deps = NULL; + } + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_FIRST(&file->common)) { + STAILQ_REMOVE(&file->common, cp, common_symbol, link); + free(cp, M_LINKER); + } + + LINKER_UNLOAD(file); + if (file->filename) { + free(file->filename, M_LINKER); + file->filename = NULL; + } + kobj_delete((kobj_t) file, M_LINKER); +out: + return (error); +} + +int +linker_file_add_dependency(linker_file_t file, linker_file_t dep) +{ + linker_file_t *newdeps; + + newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *), + M_LINKER, M_WAITOK | M_ZERO); + if (newdeps == NULL) + return (ENOMEM); + + if (file->deps) { + bcopy(file->deps, newdeps, + file->ndeps * sizeof(linker_file_t *)); + free(file->deps, M_LINKER); + } + file->deps = newdeps; + file->deps[file->ndeps] = dep; + file->ndeps++; + return (0); +} + +/* + * Locate a linker set and its contents. This is a helper function to avoid + * linker_if.h exposure elsewhere. Note: firstp and lastp are really void *** + */ +int +linker_file_lookup_set(linker_file_t file, const char *name, + void *firstp, void *lastp, int *countp) +{ + + return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); +} + +caddr_t +linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) +{ + c_linker_sym_t sym; + linker_symval_t symval; + caddr_t address; + size_t common_size = 0; + int i; + + KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n", + file, name, deps)); + + if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { + LINKER_SYMBOL_VALUES(file, sym, &symval); + if (symval.value == 0) + /* + * For commons, first look them up in the + * dependencies and only allocate space if not found + * there. + */ + common_size = symval.size; + else { + KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" + ".value=%x\n", symval.value)); + return (symval.value); + } + } + if (deps) { + for (i = 0; i < file->ndeps; i++) { + address = linker_file_lookup_symbol(file->deps[i], + name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol:" + " deps value=%x\n", address)); + return (address); + } + } + } + if (common_size > 0) { + /* + * This is a common symbol which was not found in the + * dependencies. We maintain a simple common symbol table in + * the file object. + */ + struct common_symbol *cp; + + STAILQ_FOREACH(cp, &file->common, link) { + if (strcmp(cp->name, name) == 0) { + KLD_DPF(SYM, ("linker_file_lookup_symbol:" + " old common value=%x\n", cp->address)); + return (cp->address); + } + } + /* + * Round the symbol size up to align. 
+ */ + common_size = (common_size + sizeof(int) - 1) & -sizeof(int); + cp = malloc(sizeof(struct common_symbol) + + common_size + strlen(name) + 1, M_LINKER, + M_WAITOK | M_ZERO); + if (cp == NULL) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n")); + return (0); + } + cp->address = (caddr_t)(cp + 1); + cp->name = cp->address + common_size; + strcpy(cp->name, name); + bzero(cp->address, common_size); + STAILQ_INSERT_TAIL(&file->common, cp, link); + + KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" + " value=%x\n", cp->address)); + return (cp->address); + } + KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); + return (0); +} + +#ifdef DDB +/* + * DDB Helpers. DDB has to look across multiple files with their own symbol + * tables and string tables. + * + * Note that we do not obey list locking protocols here. We really don't need + * DDB to hang because somebody's got the lock held. We'll take the chance + * that the files list is inconsistant instead. + */ + +int +linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) +{ + linker_file_t lf; + + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) + return (0); + } + return (ENOENT); +} + +int +linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) +{ + linker_file_t lf; + c_linker_sym_t best, es; + u_long diff, bestdiff, off; + + best = 0; + off = (uintptr_t)value; + bestdiff = off; + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) + continue; + if (es != 0 && diff < bestdiff) { + best = es; + bestdiff = diff; + } + if (bestdiff == 0) + break; + } + if (best) { + *sym = best; + *diffp = bestdiff; + return (0); + } else { + *sym = 0; + *diffp = off; + return (ENOENT); + } +} + +int +linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) +{ + linker_file_t lf; + + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) + return (0); + } + return (ENOENT); +} +#endif + +/* + * Syscalls. + */ +/* + * MPSAFE + */ +int +kldload(struct thread *td, struct kldload_args *uap) +{ + char *kldname, *modname; + char *pathname = NULL; + linker_file_t lf; + int error = 0; + + td->td_retval[0] = -1; + + mtx_lock(&Giant); + + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + goto out; + + if ((error = suser(td)) != 0) + goto out; + + pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN, + NULL)) != 0) + goto out; + + /* + * If path do not contain qualified name or any dot in it + * (kldname.ko, or kldname.ver.ko) treat it as interface + * name. 
+ */ + if (index(pathname, '/') || index(pathname, '.')) { + kldname = pathname; + modname = NULL; + } else { + kldname = NULL; + modname = pathname; + } + error = linker_load_module(kldname, modname, NULL, NULL, &lf); + if (error) + goto out; + + lf->userrefs++; + td->td_retval[0] = lf->id; +out: + if (pathname) + free(pathname, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldunload(struct thread *td, struct kldunload_args *uap) +{ + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + goto out; + + if ((error = suser(td)) != 0) + goto out; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); + if (lf->userrefs == 0) { + printf("kldunload: attempt to unload file that was" + " loaded by the kernel\n"); + error = EBUSY; + goto out; + } + lf->userrefs--; + error = linker_file_unload(lf); + if (error) + lf->userrefs++; + } else + error = ENOENT; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldfind(struct thread *td, struct kldfind_args *uap) +{ + char *pathname; + const char *filename; + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + td->td_retval[0] = -1; + + pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(SCARG(uap, file), pathname, MAXPATHLEN, + NULL)) != 0) + goto out; + + filename = linker_basename(pathname); + lf = linker_find_file_by_name(filename); + if (lf) + td->td_retval[0] = lf->id; + else + error = ENOENT; +out: + if (pathname) + free(pathname, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldnext(struct thread *td, struct kldnext_args *uap) +{ + linker_file_t lf; + int error = 0; + + mtx_lock(&Giant); + + if (SCARG(uap, fileid) == 0) { + mtx_lock(&kld_mtx); + if (TAILQ_FIRST(&linker_files)) + td->td_retval[0] = TAILQ_FIRST(&linker_files)->id; + else + td->td_retval[0] = 0; + mtx_unlock(&kld_mtx); + goto out; + } + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_NEXT(lf, link)) + td->td_retval[0] = TAILQ_NEXT(lf, link)->id; + else + td->td_retval[0] = 0; + } else + error = ENOENT; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldstat(struct thread *td, struct kldstat_args *uap) +{ + linker_file_t lf; + int error = 0; + int namelen, version; + struct kld_file_stat *stat; + + mtx_lock(&Giant); + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
+ */ + if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) + goto out; + if (version != sizeof(struct kld_file_stat)) { + error = EINVAL; + goto out; + } + namelen = strlen(lf->filename) + 1; + if (namelen > MAXPATHLEN) + namelen = MAXPATHLEN; + if ((error = copyout(lf->filename, &stat->name[0], namelen)) != 0) + goto out; + if ((error = copyout(&lf->refs, &stat->refs, sizeof(int))) != 0) + goto out; + if ((error = copyout(&lf->id, &stat->id, sizeof(int))) != 0) + goto out; + if ((error = copyout(&lf->address, &stat->address, + sizeof(caddr_t))) != 0) + goto out; + if ((error = copyout(&lf->size, &stat->size, sizeof(size_t))) != 0) + goto out; + + td->td_retval[0] = 0; +out: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) +{ + linker_file_t lf; + module_t mp; + int error = 0; + + mtx_lock(&Giant); + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + MOD_SLOCK; + mp = TAILQ_FIRST(&lf->modules); + if (mp != NULL) + td->td_retval[0] = module_getid(mp); + else + td->td_retval[0] = 0; + MOD_SUNLOCK; + } else + error = ENOENT; + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +kldsym(struct thread *td, struct kldsym_args *uap) +{ + char *symstr = NULL; + c_linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + struct kld_sym_lookup lookup; + int error = 0; + + mtx_lock(&Giant); + + if ((error = copyin(SCARG(uap, data), &lookup, sizeof(lookup))) != 0) + goto out; + if (lookup.version != sizeof(lookup) || + SCARG(uap, cmd) != KLDSYM_LOOKUP) { + error = EINVAL; + goto out; + } + symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) + goto out; + if (SCARG(uap, fileid) != 0) { + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && + LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { + lookup.symvalue = (uintptr_t) symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), + sizeof(lookup)); + } else + error = ENOENT; + } else { + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && + LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { + lookup.symvalue = (uintptr_t)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), + sizeof(lookup)); + break; + } + } + mtx_unlock(&kld_mtx); + if (lf == NULL) + error = ENOENT; + } +out: + if (symstr) + free(symstr, M_TEMP); + mtx_unlock(&Giant); + return (error); +} + +/* + * Preloaded module support + */ + +static modlist_t +modlist_lookup(const char *name, int ver) +{ + modlist_t mod; + + TAILQ_FOREACH(mod, &found_modules, link) { + if (strcmp(mod->name, name) == 0 && + (ver == 0 || mod->version == ver)) + return (mod); + } + return (NULL); +} + +static modlist_t +modlist_lookup2(const char *name, struct mod_depend *verinfo) +{ + modlist_t mod, bestmod; + int ver; + + if (verinfo == NULL) + return (modlist_lookup(name, 0)); + bestmod = NULL; + for (mod = TAILQ_FIRST(&found_modules); mod; + mod = TAILQ_NEXT(mod, link)) { + if (strcmp(mod->name, name) != 0) + continue; + ver = mod->version; + if (ver == verinfo->md_ver_preferred) + return (mod); + if (ver >= verinfo->md_ver_minimum && + ver <= verinfo->md_ver_maximum && + ver > bestmod->version) + bestmod = mod; + } + return (bestmod); +} + +static modlist_t +modlist_newmodule(const 
char *modname, int version, linker_file_t container) +{ + modlist_t mod; + + mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); + if (mod == NULL) + panic("no memory for module list"); + mod->container = container; + mod->name = modname; + mod->version = version; + TAILQ_INSERT_TAIL(&found_modules, mod, link); + return (mod); +} + +/* + * This routine is cheap and nasty but will work for data pointers. + */ +static void * +linker_reloc_ptr(linker_file_t lf, const void *offset) +{ + return (lf->address + (uintptr_t)offset); +} + +/* + * Dereference MDT_VERSION metadata into module name and version + */ +static void +linker_mdt_version(linker_file_t lf, struct mod_metadata *mp, + const char **modname, int *version) +{ + struct mod_version *mvp; + + if (modname) + *modname = linker_reloc_ptr(lf, mp->md_cval); + if (version) { + mvp = linker_reloc_ptr(lf, mp->md_data); + *version = mvp->mv_version; + } +} + +/* + * Dereference MDT_DEPEND metadata into module name and mod_depend structure + */ +static void +linker_mdt_depend(linker_file_t lf, struct mod_metadata *mp, + const char **modname, struct mod_depend **verinfo) +{ + + if (modname) + *modname = linker_reloc_ptr(lf, mp->md_cval); + if (verinfo) + *verinfo = linker_reloc_ptr(lf, mp->md_data); +} + +static void +linker_addmodules(linker_file_t lf, struct mod_metadata **start, + struct mod_metadata **stop, int preload) +{ + struct mod_metadata *mp, **mdp; + const char *modname; + int ver; + + for (mdp = start; mdp < stop; mdp++) { + if (preload) + mp = *mdp; + else + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + if (preload) { + modname = mp->md_cval; + ver = ((struct mod_version *)mp->md_data)->mv_version; + } else + linker_mdt_version(lf, mp, &modname, &ver); + if (modlist_lookup(modname, ver) != NULL) { + printf("module %s already present!\n", modname); + /* XXX what can we do? this is a build error. :-( */ + continue; + } + modlist_newmodule(modname, ver, lf); + } +} + +static void +linker_preload(void *arg) +{ + caddr_t modptr; + const char *modname, *nmodname; + char *modtype; + linker_file_t lf; + linker_class_t lc; + int error; + linker_file_list_t loaded_files; + linker_file_list_t depended_files; + struct mod_metadata *mp, *nmp; + struct mod_metadata **start, **stop, **mdp, **nmdp; + struct mod_depend *verinfo; + int nver; + int resolves; + modlist_t mod; + struct sysinit **si_start, **si_stop; + + TAILQ_INIT(&loaded_files); + TAILQ_INIT(&depended_files); + TAILQ_INIT(&found_modules); + error = 0; + + modptr = NULL; + while ((modptr = preload_search_next_name(modptr)) != NULL) { + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); + if (modname == NULL) { + printf("Preloaded module at %p does not have a" + " name!\n", modptr); + continue; + } + if (modtype == NULL) { + printf("Preloaded module at %p does not have a type!\n", + modptr); + continue; + } + printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, + modptr); + lf = NULL; + TAILQ_FOREACH(lc, &classes, link) { + error = LINKER_LINK_PRELOAD(lc, modname, &lf); + if (error) { + lf = NULL; + break; + } + } + if (lf) + TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); + } + + /* + * First get a list of stuff in the kernel. 
+ */ + if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, + &stop, NULL) == 0) + linker_addmodules(linker_kernel_file, start, stop, 1); + + /* + * this is a once-off kinky bubble sort resolve relocation dependency + * requirements + */ +restart: + TAILQ_FOREACH(lf, &loaded_files, loaded) { + error = linker_file_lookup_set(lf, MDT_SETNAME, &start, + &stop, NULL); + /* + * First, look to see if we would successfully link with this + * stuff. + */ + resolves = 1; /* unless we know otherwise */ + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + for (nmdp = start; nmdp < stop; nmdp++) { + nmp = linker_reloc_ptr(lf, *nmdp); + if (nmp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, nmp, &nmodname, + NULL); + nmodname = linker_reloc_ptr(lf, + nmp->md_cval); + if (strcmp(modname, nmodname) == 0) + break; + } + if (nmdp < stop) /* it's a self reference */ + continue; + + /* + * ok, the module isn't here yet, we + * are not finished + */ + if (modlist_lookup2(modname, verinfo) == NULL) + resolves = 0; + } + } + /* + * OK, if we found our modules, we can link. So, "provide" + * the modules inside and add it to the end of the link order + * list. + */ + if (resolves) { + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, mp, + &modname, &nver); + if (modlist_lookup(modname, + nver) != NULL) { + printf("module %s already" + " present!\n", modname); + linker_file_unload(lf); + TAILQ_REMOVE(&loaded_files, + lf, loaded); + /* we changed tailq next ptr */ + goto restart; + } + modlist_newmodule(modname, nver, lf); + } + } + TAILQ_REMOVE(&loaded_files, lf, loaded); + TAILQ_INSERT_TAIL(&depended_files, lf, loaded); + /* + * Since we provided modules, we need to restart the + * sort so that the previous files that depend on us + * have a chance. Also, we've busted the tailq next + * pointer with the REMOVE. + */ + goto restart; + } + } + + /* + * At this point, we check to see what could not be resolved.. + */ + TAILQ_FOREACH(lf, &loaded_files, loaded) { + printf("KLD file %s is missing dependencies\n", lf->filename); + linker_file_unload(lf); + TAILQ_REMOVE(&loaded_files, lf, loaded); + } + + /* + * We made it. Finish off the linking in the order we determined. 
+ */ + TAILQ_FOREACH(lf, &depended_files, loaded) { + if (linker_kernel_file) { + linker_kernel_file->refs++; + error = linker_file_add_dependency(lf, + linker_kernel_file); + if (error) + panic("cannot add dependency"); + } + lf->userrefs++; /* so we can (try to) kldunload it */ + error = linker_file_lookup_set(lf, MDT_SETNAME, &start, + &stop, NULL); + if (!error) { + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + mod = modlist_lookup2(modname, verinfo); + mod->container->refs++; + error = linker_file_add_dependency(lf, + mod->container); + if (error) + panic("cannot add dependency"); + } + } + /* + * Now do relocation etc using the symbol search paths + * established by the dependencies + */ + error = LINKER_LINK_PRELOAD_FINISH(lf); + if (error) { + printf("KLD file %s - could not finalize loading\n", + lf->filename); + linker_file_unload(lf); + continue; + } + linker_file_register_modules(lf); + if (linker_file_lookup_set(lf, "sysinit_set", &si_start, + &si_stop, NULL) == 0) + sysinit_add(si_start, si_stop); + linker_file_register_sysctls(lf); + lf->flags |= LINKER_FILE_LINKED; + } + /* woohoo! we made it! */ +} + +SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0) + +/* + * Search for a not-loaded module by name. + * + * Modules may be found in the following locations: + * + * - preloaded (result is just the module name) - on disk (result is full path + * to module) + * + * If the module name is qualified in any way (contains path, etc.) the we + * simply return a copy of it. + * + * The search path can be manipulated via sysctl. Note that we use the ';' + * character as a separator to be consistent with the bootloader. + */ + +static char linker_hintfile[] = "linker.hints"; +static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules;/modules"; + +SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path, + sizeof(linker_path), "module load search path"); + +TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); + +static char *linker_ext_list[] = { + "", + ".ko", + NULL +}; + +/* + * Check if file actually exists either with or without extension listed in + * the linker_ext_list. (probably should be generic for the rest of the + * kernel) + */ +static char * +linker_lookup_file(const char *path, int pathlen, const char *name, + int namelen, struct vattr *vap) +{ + struct nameidata nd; + struct thread *td = curthread; /* XXX */ + char *result, **cpp, *sep; + int error, len, extlen, reclen, flags; + enum vtype type; + + extlen = 0; + for (cpp = linker_ext_list; *cpp; cpp++) { + len = strlen(*cpp); + if (len > extlen) + extlen = len; + } + extlen++; /* trailing '\0' */ + sep = (path[pathlen - 1] != '/') ? "/" : ""; + + reclen = pathlen + strlen(sep) + namelen + extlen + 1; + result = malloc(reclen, M_LINKER, M_WAITOK); + for (cpp = linker_ext_list; *cpp; cpp++) { + snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, + namelen, name, *cpp); + /* + * Attempt to open the file, and return the path if + * we succeed and it's a regular file. 
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + type = nd.ni_vp->v_type; + if (vap) + VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + if (type == VREG) + return (result); + } + } + free(result, M_LINKER); + return (NULL); +} + +#define INT_ALIGN(base, ptr) ptr = \ + (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1)) + +/* + * Lookup KLD which contains requested module in the "linker.hints" file. If + * version specification is available, then try to find the best KLD. + * Otherwise just find the latest one. + * + * XXX: Vnode locking here is hosed; lock should be held for calls to + * VOP_GETATTR() and vn_rdwr(). + */ +static char * +linker_hints_lookup(const char *path, int pathlen, const char *modname, + int modnamelen, struct mod_depend *verinfo) +{ + struct thread *td = curthread; /* XXX */ + struct ucred *cred = td ? td->td_ucred : NULL; + struct nameidata nd; + struct vattr vattr, mattr; + u_char *hints = NULL; + u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; + int error, ival, bestver, *intp, reclen, found, flags, clen, blen; + + result = NULL; + bestver = found = 0; + + sep = (path[pathlen - 1] != '/') ? "/" : ""; + reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + + strlen(sep) + 1; + pathbuf = malloc(reclen, M_LINKER, M_WAITOK); + snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, + linker_hintfile); + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type != VREG) + goto bad; + best = cp = NULL; + error = VOP_GETATTR(nd.ni_vp, &vattr, cred, td); + if (error) + goto bad; + /* + * XXX: we need to limit this number to some reasonable value + */ + if (vattr.va_size > 100 * 1024) { + printf("hints file too large %ld\n", (long)vattr.va_size); + goto bad; + } + hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); + if (hints == NULL) + goto bad; + error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, + UIO_SYSSPACE, IO_NODELOCKED, cred, &reclen, td); + if (error) + goto bad; + vn_close(nd.ni_vp, FREAD, cred, td); + nd.ni_vp = NULL; + if (reclen != 0) { + printf("can't read %d\n", reclen); + goto bad; + } + intp = (int *)hints; + ival = *intp++; + if (ival != LINKER_HINTS_VERSION) { + printf("hints file version mismatch %d\n", ival); + goto bad; + } + bufend = hints + vattr.va_size; + recptr = (u_char *)intp; + clen = blen = 0; + while (recptr < bufend && !found) { + intp = (int *)recptr; + reclen = *intp++; + ival = *intp++; + cp = (char *)intp; + switch (ival) { + case MDT_VERSION: + clen = *cp++; + if (clen != modnamelen || bcmp(cp, modname, clen) != 0) + break; + cp += clen; + INT_ALIGN(hints, cp); + ival = *(int *)cp; + cp += sizeof(int); + clen = *cp++; + if (verinfo == NULL || + ival == verinfo->md_ver_preferred) { + found = 1; + break; + } + if (ival >= verinfo->md_ver_minimum && + ival <= verinfo->md_ver_maximum && + ival > bestver) { + bestver = ival; + best = cp; + blen = clen; + } + break; + default: + break; + } + recptr += reclen + sizeof(int); + } + /* + * Finally check if KLD is in the place + */ + if (found) + result = linker_lookup_file(path, pathlen, cp, clen, &mattr); + else if (best) + result = linker_lookup_file(path, pathlen, best, blen, 
&mattr); + + /* + * KLD is newer than hints file. What we should do now? + */ + if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) + printf("warning: KLD '%s' is newer than the linker.hints" + " file\n", result); +bad: + if (hints) + free(hints, M_TEMP); + if (nd.ni_vp != NULL) + vn_close(nd.ni_vp, FREAD, cred, td); + /* + * If nothing found or hints is absent - fallback to the old + * way by using "kldname[.ko]" as module name. + */ + if (!found && !bestver && result == NULL) + result = linker_lookup_file(path, pathlen, modname, + modnamelen, NULL); + return (result); +} + +/* + * Lookup KLD which contains requested module in the all directories. + */ +static char * +linker_search_module(const char *modname, int modnamelen, + struct mod_depend *verinfo) +{ + char *cp, *ep, *result; + + /* + * traverse the linker path + */ + for (cp = linker_path; *cp; cp = ep + 1) { + /* find the end of this component */ + for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); + result = linker_hints_lookup(cp, ep - cp, modname, + modnamelen, verinfo); + if (result != NULL) + return (result); + if (*ep == 0) + break; + } + return (NULL); +} + +/* + * Search for module in all directories listed in the linker_path. + */ +static char * +linker_search_kld(const char *name) +{ + char *cp, *ep, *result, **cpp; + int extlen, len; + + /* qualified at all? */ + if (index(name, '/')) + return (linker_strdup(name)); + + extlen = 0; + for (cpp = linker_ext_list; *cpp; cpp++) { + len = strlen(*cpp); + if (len > extlen) + extlen = len; + } + extlen++; /* trailing '\0' */ + + /* traverse the linker path */ + len = strlen(name); + for (ep = linker_path; *ep; ep++) { + cp = ep; + /* find the end of this component */ + for (; *ep != 0 && *ep != ';'; ep++); + result = linker_lookup_file(cp, ep - cp, name, len, NULL); + if (result != NULL) + return (result); + } + return (NULL); +} + +static const char * +linker_basename(const char *path) +{ + const char *filename; + + filename = rindex(path, '/'); + if (filename == NULL) + return path; + if (filename[1]) + filename++; + return (filename); +} + +/* + * Find a file which contains given module and load it, if "parent" is not + * NULL, register a reference to it. + */ +static int +linker_load_module(const char *kldname, const char *modname, + struct linker_file *parent, struct mod_depend *verinfo, + struct linker_file **lfpp) +{ + linker_file_t lfdep; + const char *filename; + char *pathname; + int error; + + if (modname == NULL) { + /* + * We have to load KLD + */ + KASSERT(verinfo == NULL, ("linker_load_module: verinfo" + " is not NULL")); + pathname = linker_search_kld(kldname); + } else { + if (modlist_lookup2(modname, verinfo) != NULL) + return (EEXIST); + if (kldname != NULL) + pathname = linker_strdup(kldname); + else if (rootvnode == NULL) + pathname = NULL; + else + /* + * Need to find a KLD with required module + */ + pathname = linker_search_module(modname, + strlen(modname), verinfo); + } + if (pathname == NULL) + return (ENOENT); + + /* + * Can't load more than one file with the same basename XXX: + * Actually it should be possible to have multiple KLDs with + * the same basename but different path because they can + * provide different versions of the same modules. 
+ */ + filename = linker_basename(pathname); + if (linker_find_file_by_name(filename)) { + error = EEXIST; + goto out; + } + do { + error = linker_load_file(pathname, &lfdep); + if (error) + break; + if (modname && verinfo && + modlist_lookup2(modname, verinfo) == NULL) { + linker_file_unload(lfdep); + error = ENOENT; + break; + } + if (parent) { + error = linker_file_add_dependency(parent, lfdep); + if (error) + break; + } + if (lfpp) + *lfpp = lfdep; + } while (0); +out: + if (pathname) + free(pathname, M_LINKER); + return (error); +} + +/* + * This routine is responsible for finding dependencies of userland-initiated + * kldload(2)'s of files. + */ +int +linker_load_dependencies(linker_file_t lf) +{ + linker_file_t lfdep; + struct mod_metadata **start, **stop, **mdp, **nmdp; + struct mod_metadata *mp, *nmp; + struct mod_depend *verinfo; + modlist_t mod; + const char *modname, *nmodname; + int ver, error = 0, count; + + /* + * All files are dependent on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + error = linker_file_add_dependency(lf, linker_kernel_file); + if (error) + return (error); + } + if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, + &count) != 0) + return (0); + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_VERSION) + continue; + linker_mdt_version(lf, mp, &modname, &ver); + mod = modlist_lookup(modname, ver); + if (mod != NULL) { + printf("interface %s.%d already present in the KLD" + " '%s'!\n", modname, ver, + mod->container->filename); + return (EEXIST); + } + } + + for (mdp = start; mdp < stop; mdp++) { + mp = linker_reloc_ptr(lf, *mdp); + if (mp->md_type != MDT_DEPEND) + continue; + linker_mdt_depend(lf, mp, &modname, &verinfo); + nmodname = NULL; + for (nmdp = start; nmdp < stop; nmdp++) { + nmp = linker_reloc_ptr(lf, *nmdp); + if (nmp->md_type != MDT_VERSION) + continue; + nmodname = linker_reloc_ptr(lf, nmp->md_cval); + if (strcmp(modname, nmodname) == 0) + break; + } + if (nmdp < stop)/* early exit, it's a self reference */ + continue; + mod = modlist_lookup2(modname, verinfo); + if (mod) { /* woohoo, it's loaded already */ + lfdep = mod->container; + lfdep->refs++; + error = linker_file_add_dependency(lf, lfdep); + if (error) + break; + continue; + } + error = linker_load_module(NULL, modname, lf, verinfo, NULL); + if (error) { + printf("KLD %s: depends on %s - not available\n", + lf->filename, modname); + break; + } + } + + if (error) + return (error); + linker_addmodules(lf, start, stop, 0); + return (error); +} + +static int +sysctl_kern_function_list_iterate(const char *name, void *opaque) +{ + struct sysctl_req *req; + + req = opaque; + return (SYSCTL_OUT(req, name, strlen(name) + 1)); +} + +/* + * Export a nul-separated, double-nul-terminated list of all function names + * in the kernel.
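+ * + * As a usage sketch (not part of this file): from userland the list exported + * via the kern.function_list sysctl defined below could be read with, e.g., + * sysctlbyname("kern.function_list", buf, &buflen, NULL, 0), where buf and + * buflen are caller-supplied.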
+ */ +static int +sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) +{ + linker_file_t lf; + int error; + + mtx_lock(&kld_mtx); + TAILQ_FOREACH(lf, &linker_files, link) { + error = LINKER_EACH_FUNCTION_NAME(lf, + sysctl_kern_function_list_iterate, req); + if (error) { + mtx_unlock(&kld_mtx); + return (error); + } + } + mtx_unlock(&kld_mtx); + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLFLAG_RD, + NULL, 0, sysctl_kern_function_list, "", "kernel function list"); diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c new file mode 100644 index 0000000..5189bb7 --- /dev/null +++ b/sys/kern/kern_lock.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 1995 + * The Regents of the University of California. All rights reserved. + * + * Copyright (C) 1997 + * John S. Dyson. All rights reserved. + * + * This code contains ideas from software contributed to Berkeley by + * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating + * System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +/* + * Locking primitives implementation. + * Locks provide shared/exclusive synchronization. + */ + +#define LOCK_WAIT_TIME 100 +#define LOCK_SAMPLE_WAIT 7 + +#if defined(DIAGNOSTIC) +#define LOCK_INLINE +#else +#define LOCK_INLINE __inline +#endif + +#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \ + LK_SHARE_NONZERO | LK_WAIT_NONZERO) + +/* + * Mutex array variables.
Rather than each lockmgr lock having its own mutex, + * share a fixed (at boot time) number of mutexes across all lockmgr locks in + * order to keep sizeof(struct lock) down. + */ +int lock_mtx_valid; +static struct mtx lock_mtx; + +static int acquire(struct lock *lkp, int extflags, int wanted); +static int apause(struct lock *lkp, int flags); +static int acquiredrain(struct lock *lkp, int extflags) ; + +static void +lockmgr_init(void *dummy __unused) +{ + /* + * Initialize the lockmgr protection mutex if it hasn't already been + * done. Unless something changes about kernel startup order, VM + * initialization will always cause this mutex to already be + * initialized in a call to lockinit(). + */ + if (lock_mtx_valid == 0) { + mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF); + lock_mtx_valid = 1; + } +} +SYSINIT(lmgrinit, SI_SUB_LOCK, SI_ORDER_FIRST, lockmgr_init, NULL) + +static LOCK_INLINE void +sharelock(struct lock *lkp, int incr) { + lkp->lk_flags |= LK_SHARE_NONZERO; + lkp->lk_sharecount += incr; +} + +static LOCK_INLINE void +shareunlock(struct lock *lkp, int decr) { + + KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr")); + + if (lkp->lk_sharecount == decr) { + lkp->lk_flags &= ~LK_SHARE_NONZERO; + if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) { + wakeup(lkp); + } + lkp->lk_sharecount = 0; + } else { + lkp->lk_sharecount -= decr; + } +} + +/* + * This is the waitloop optimization. + */ +static int +apause(struct lock *lkp, int flags) +{ +#ifdef SMP + int i, lock_wait; +#endif + + if ((lkp->lk_flags & flags) == 0) + return 0; +#ifdef SMP + for (lock_wait = LOCK_WAIT_TIME; lock_wait > 0; lock_wait--) { + mtx_unlock(lkp->lk_interlock); + for (i = LOCK_SAMPLE_WAIT; i > 0; i--) + if ((lkp->lk_flags & flags) == 0) + break; + mtx_lock(lkp->lk_interlock); + if ((lkp->lk_flags & flags) == 0) + return 0; + } +#endif + return 1; +} + +static int +acquire(struct lock *lkp, int extflags, int wanted) { + int s, error; + + CTR3(KTR_LOCKMGR, + "acquire(): lkp == %p, extflags == 0x%x, wanted == 0x%x\n", + lkp, extflags, wanted); + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) { + return EBUSY; + } + + if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) { + error = apause(lkp, wanted); + if (error == 0) + return 0; + } + + s = splhigh(); + while ((lkp->lk_flags & wanted) != 0) { + lkp->lk_flags |= LK_WAIT_NONZERO; + lkp->lk_waitcount++; + error = msleep(lkp, lkp->lk_interlock, lkp->lk_prio, + lkp->lk_wmesg, + ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0)); + if (lkp->lk_waitcount == 1) { + lkp->lk_flags &= ~LK_WAIT_NONZERO; + lkp->lk_waitcount = 0; + } else { + lkp->lk_waitcount--; + } + if (error) { + splx(s); + return error; + } + if (extflags & LK_SLEEPFAIL) { + splx(s); + return ENOLCK; + } + } + splx(s); + return 0; +} + +/* + * Set, change, or release a lock. + * + * Shared requests increment the shared count. Exclusive requests set the + * LK_WANT_EXCL flag (preventing further shared locks), and wait for already + * accepted shared locks and shared-to-exclusive upgrades to go away. 
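+ * + * A minimal usage sketch (hypothetical lock and wait message, not taken from + * this file): + * struct lock lk; + * lockinit(&lk, PVFS, "examplk", 0, 0); + * lockmgr(&lk, LK_EXCLUSIVE, NULL, curthread); + * ... critical section ... + * lockmgr(&lk, LK_RELEASE, NULL, curthread);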
+ */ +int +#ifndef DEBUG_LOCKS +lockmgr(lkp, flags, interlkp, td) +#else +debuglockmgr(lkp, flags, interlkp, td, name, file, line) +#endif + struct lock *lkp; + u_int flags; + struct mtx *interlkp; + struct thread *td; +#ifdef DEBUG_LOCKS + const char *name; /* Name of lock function */ + const char *file; /* Name of file call is from */ + int line; /* Line number in file */ +#endif +{ + int error; + pid_t pid; + int extflags, lockflags; + + CTR5(KTR_LOCKMGR, + "lockmgr(): lkp == %p (lk_wmesg == \"%s\"), flags == 0x%x, " + "interlkp == %p, td == %p", lkp, lkp->lk_wmesg, flags, interlkp, td); + + error = 0; + if (td == NULL) + pid = LK_KERNPROC; + else + pid = td->td_proc->p_pid; + + mtx_lock(lkp->lk_interlock); + if (flags & LK_INTERLOCK) { + mtx_assert(interlkp, MA_OWNED | MA_NOTRECURSED); + mtx_unlock(interlkp); + } + + if (panicstr != NULL) { + mtx_unlock(lkp->lk_interlock); + return (0); + } + + extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK; + + switch (flags & LK_TYPE_MASK) { + + case LK_SHARED: + /* + * If we are not the exclusive lock holder, we have to block + * while there is an exclusive lock holder or while an + * exclusive lock request or upgrade request is in progress. + * + * However, if TDF_DEADLKTREAT is set, we override exclusive + * lock requests or upgrade requests ( but not the exclusive + * lock itself ). + */ + if (lkp->lk_lockholder != pid) { + lockflags = LK_HAVE_EXCL; + mtx_lock_spin(&sched_lock); + if (td != NULL && !(td->td_flags & TDF_DEADLKTREAT)) + lockflags |= LK_WANT_EXCL | LK_WANT_UPGRADE; + mtx_unlock_spin(&sched_lock); + error = acquire(lkp, extflags, lockflags); + if (error) + break; + sharelock(lkp, 1); +#if defined(DEBUG_LOCKS) + lkp->lk_slockholder = pid; + lkp->lk_sfilename = file; + lkp->lk_slineno = line; + lkp->lk_slockername = name; +#endif + break; + } + /* + * We hold an exclusive lock, so downgrade it to shared. + * An alternative would be to fail with EDEADLK. + */ + sharelock(lkp, 1); + /* fall into downgrade */ + + case LK_DOWNGRADE: + KASSERT(lkp->lk_lockholder == pid && lkp->lk_exclusivecount != 0, + ("lockmgr: not holding exclusive lock " + "(owner pid (%d) != pid (%d), exlcnt (%d) != 0", + lkp->lk_lockholder, pid, lkp->lk_exclusivecount)); + sharelock(lkp, lkp->lk_exclusivecount); + lkp->lk_exclusivecount = 0; + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + if (lkp->lk_waitcount) + wakeup((void *)lkp); + break; + + case LK_EXCLUPGRADE: + /* + * If another process is ahead of us to get an upgrade, + * then we want to fail rather than have an intervening + * exclusive access. + */ + if (lkp->lk_flags & LK_WANT_UPGRADE) { + shareunlock(lkp, 1); + error = EBUSY; + break; + } + /* fall into normal upgrade */ + + case LK_UPGRADE: + /* + * Upgrade a shared lock to an exclusive one. If another + * shared lock has already requested an upgrade to an + * exclusive lock, our shared lock is released and an + * exclusive lock is requested (which will be granted + * after the upgrade). If we return an error, the file + * will always be unlocked. + */ + if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0)) + panic("lockmgr: upgrade exclusive lock"); + shareunlock(lkp, 1); + /* + * If we are just polling, check to see if we will block. 
+ */ + if ((extflags & LK_NOWAIT) && + ((lkp->lk_flags & LK_WANT_UPGRADE) || + lkp->lk_sharecount > 1)) { + error = EBUSY; + break; + } + if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) { + /* + * We are first shared lock to request an upgrade, so + * request upgrade and wait for the shared count to + * drop to zero, then take exclusive lock. + */ + lkp->lk_flags |= LK_WANT_UPGRADE; + error = acquire(lkp, extflags, LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_UPGRADE; + + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + } + /* + * Someone else has requested upgrade. Release our shared + * lock, awaken upgrade requestor if we are the last shared + * lock, then request an exclusive lock. + */ + if ( (lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) == + LK_WAIT_NONZERO) + wakeup((void *)lkp); + /* fall into exclusive request */ + + case LK_EXCLUSIVE: + if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) { + /* + * Recursive lock. + */ + if ((extflags & (LK_NOWAIT | LK_CANRECURSE)) == 0) + panic("lockmgr: locking against myself"); + if ((extflags & LK_CANRECURSE) != 0) { + lkp->lk_exclusivecount++; + break; + } + } + /* + * If we are just polling, check to see if we will sleep. + */ + if ((extflags & LK_NOWAIT) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) { + error = EBUSY; + break; + } + /* + * Try to acquire the want_exclusive flag. + */ + error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL)); + if (error) + break; + lkp->lk_flags |= LK_WANT_EXCL; + /* + * Wait for shared locks and upgrades to finish. + */ + error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_EXCL; + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + + case LK_RELEASE: + if (lkp->lk_exclusivecount != 0) { + if (lkp->lk_lockholder != pid && + lkp->lk_lockholder != LK_KERNPROC) { + panic("lockmgr: pid %d, not %s %d unlocking", + pid, "exclusive lock holder", + lkp->lk_lockholder); + } + if (lkp->lk_exclusivecount == 1) { + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + lkp->lk_exclusivecount = 0; + } else { + lkp->lk_exclusivecount--; + } + } else if (lkp->lk_flags & LK_SHARE_NONZERO) + shareunlock(lkp, 1); + if (lkp->lk_flags & LK_WAIT_NONZERO) + wakeup((void *)lkp); + break; + + case LK_DRAIN: + /* + * Check that we do not already hold the lock, as it can + * never drain if we do. Unfortunately, we have no way to + * check for holding a shared lock, but at least we can + * check for an exclusive one. 
+ */ + if (lkp->lk_lockholder == pid) + panic("lockmgr: draining against myself"); + + error = acquiredrain(lkp, extflags); + if (error) + break; + lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + break; + + default: + mtx_unlock(lkp->lk_interlock); + panic("lockmgr: unknown locktype request %d", + flags & LK_TYPE_MASK); + /* NOTREACHED */ + } + if ((lkp->lk_flags & LK_WAITDRAIN) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | + LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) { + lkp->lk_flags &= ~LK_WAITDRAIN; + wakeup((void *)&lkp->lk_flags); + } + mtx_unlock(lkp->lk_interlock); + return (error); +} + +static int +acquiredrain(struct lock *lkp, int extflags) { + int error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) { + return EBUSY; + } + + error = apause(lkp, LK_ALL); + if (error == 0) + return 0; + + while (lkp->lk_flags & LK_ALL) { + lkp->lk_flags |= LK_WAITDRAIN; + error = msleep(&lkp->lk_flags, lkp->lk_interlock, lkp->lk_prio, + lkp->lk_wmesg, + ((extflags & LK_TIMELOCK) ? lkp->lk_timo : 0)); + if (error) + return error; + if (extflags & LK_SLEEPFAIL) { + return ENOLCK; + } + } + return 0; +} + +/* + * Initialize a lock; required before use. + */ +void +lockinit(lkp, prio, wmesg, timo, flags) + struct lock *lkp; + int prio; + const char *wmesg; + int timo; + int flags; +{ + CTR5(KTR_LOCKMGR, "lockinit(): lkp == %p, prio == %d, wmesg == \"%s\", " + "timo == %d, flags = 0x%x\n", lkp, prio, wmesg, timo, flags); + + if (lock_mtx_valid == 0) { + mtx_init(&lock_mtx, "lockmgr", NULL, MTX_DEF); + lock_mtx_valid = 1; + } + /* + * XXX cleanup - make sure mtxpool is always initialized before + * this is ever called. + */ + if (mtx_pool_valid) { + mtx_lock(&lock_mtx); + lkp->lk_interlock = mtx_pool_alloc(); + mtx_unlock(&lock_mtx); + } else { + lkp->lk_interlock = &lock_mtx; + } + lkp->lk_flags = (flags & LK_EXTFLG_MASK); + lkp->lk_sharecount = 0; + lkp->lk_waitcount = 0; + lkp->lk_exclusivecount = 0; + lkp->lk_prio = prio; + lkp->lk_wmesg = wmesg; + lkp->lk_timo = timo; + lkp->lk_lockholder = LK_NOPROC; +} + +/* + * Destroy a lock. + */ +void +lockdestroy(lkp) + struct lock *lkp; +{ + CTR2(KTR_LOCKMGR, "lockdestroy(): lkp == %p (lk_wmesg == \"%s\")", + lkp, lkp->lk_wmesg); +} + +/* + * Determine the status of a lock. + */ +int +lockstatus(lkp, td) + struct lock *lkp; + struct thread *td; +{ + int lock_type = 0; + + mtx_lock(lkp->lk_interlock); + if (lkp->lk_exclusivecount != 0) { + if (td == NULL || lkp->lk_lockholder == td->td_proc->p_pid) + lock_type = LK_EXCLUSIVE; + else + lock_type = LK_EXCLOTHER; + } else if (lkp->lk_sharecount != 0) + lock_type = LK_SHARED; + mtx_unlock(lkp->lk_interlock); + return (lock_type); +} + +/* + * Determine the number of holders of a lock. + */ +int +lockcount(lkp) + struct lock *lkp; +{ + int count; + + mtx_lock(lkp->lk_interlock); + count = lkp->lk_exclusivecount + lkp->lk_sharecount; + mtx_unlock(lkp->lk_interlock); + return (count); +} + +/* + * Print out information about state of a lock. Used by VOP_PRINT + * routines to display status about contained locks. 
+ */ +void +lockmgr_printinfo(lkp) + struct lock *lkp; +{ + + if (lkp->lk_sharecount) + printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg, + lkp->lk_sharecount); + else if (lkp->lk_flags & LK_HAVE_EXCL) + printf(" lock type %s: EXCL (count %d) by pid %d", + lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder); + if (lkp->lk_waitcount > 0) + printf(" with %d pending", lkp->lk_waitcount); +} diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..c1cadb1 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,846 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $FreeBSD$ + */ + +#include "opt_debug_lockf.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> +#include <sys/lockf.h> + +#include <machine/limits.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. 
+ */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + + +static int lockf_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock(struct lockf *); +static int lf_findoverlap(struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **); +static struct lockf * + lf_getblock(struct lockf *); +static int lf_getlock(struct lockf *, struct flock *); +static int lf_setlock(struct lockf *); +static void lf_split(struct lockf *, struct lockf *); +static void lf_wakelock(struct lockf *); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end, oadd; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + if (size > OFF_MAX || + (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) + return (EOVERFLOW); + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len < 0) { + if (start == 0) + return (EINVAL); + end = start - 1; + start += fl->l_len; + if (start < 0) + return (EINVAL); + } else if (fl->l_len == 0) + end = -1; + else { + oadd = fl->l_len - 1; + if (oadd > OFF_MAX - start) + return (EOVERFLOW); + end = start + oadd; + } + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; + /* + * XXX The problem is that VTOI is ufs specific, so it will + * break LOCKF_DEBUG for all other FS's other than UFS because + * it casts the vnode->data ptr to struct inode *. + */ +/* lock->lf_inode = VTOI(ap->a_vp); */ + lock->lf_inode = (struct inode *)0; + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. 
+ */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + struct thread *td; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + /* XXXKSE this is not complete under threads */ + wproc = (struct proc *)block->lf_id; + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(wproc, td) { + while (td->td_wchan && + (td->td_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)td->td_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + mtx_unlock_spin(&sched_lock); + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + mtx_unlock_spin(&sched_lock); + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + error = tsleep(lock, priority, lockstr, 0); + /* + * We may have been awakened by a signal and/or by a + * debugger continuing us (in which cases we must remove + * ourselves from the blocked list) and/or by another + * process releasing a lock (in which case we have + * already been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) { + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + lock->lf_next = NOLOCKF; + } + if (error) { + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. 
+ */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { + ltmp = TAILQ_FIRST(&overlap->lf_blkhd); + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + ltmp->lf_next = lock; + } + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
+ */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if splitting into only two pieces.
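+ * (For example, with illustrative byte ranges: carving lock2 covering bytes + * 40-59 out of lock1 covering bytes 0-99 shares neither endpoint, so three + * pieces result: 0-39, 40-59 and 60-99; if the two locks shared a start or + * end offset, only two pieces would be needed.)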
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy(lock1, splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { + wakelock = TAILQ_FIRST(&listhead->lf_blkhd); + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup(wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock %p for ", tag, (void *)lock); + if (lock->lf_flags & F_POSIX) + printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); + else + printf("id %p", (void *)lock->lf_id); + if (lock->lf_inode != (struct inode *)0) + /* XXX no %qd in kernel. Truncate. */ + printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld", + (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + else + printf(" %s, start %ld, end %ld", + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + if (!TAILQ_EMPTY(&lock->lf_blkhd)) + printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd)); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + if (lock->lf_inode == (struct inode *)0) + return; + + printf("%s: Lock list for ino %lu on dev <%d, %d>:\n", + tag, (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock %p for ",(void *)lf); + if (lf->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)lf->lf_id)->p_pid); + else + printf("id %p", (void *)lf->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lf->lf_start, (long)lf->lf_end); + TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { + printf("\n\t\tlock request %p for ", (void *)blk); + if (blk->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)blk->lf_id)->p_pid); + else + printf("id %p", (void *)blk->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? 
"unlock" : + "unknown", (long)blk->lf_start, + (long)blk->lf_end); + if (!TAILQ_EMPTY(&blk->lf_blkhd)) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 0000000..c7bec3e --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,618 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_vm.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/vmmeter.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <vm/uma_int.h> +#include <vm/uma_dbg.h> + +#if defined(INVARIANTS) && defined(__i386__) +#include <machine/cpu.h> +#endif + +/* + * When realloc() is called, if the new size is sufficiently smaller than + * the old size, realloc() will allocate a new, smaller block to avoid + * wasting memory. 'Sufficiently smaller' is defined as: newsize <= + * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. 
+ */ +#ifndef REALLOC_FRACTION +#define REALLOC_FRACTION 1 /* new block if <= half the size */ +#endif + +MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches"); +MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); +MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers"); + +MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); +MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); + +static void kmeminit(void *); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static MALLOC_DEFINE(M_FREE, "free", "should be on free list"); + +static struct malloc_type *kmemstatistics; +static char *kmembase; +static char *kmemlimit; + +#define KMEM_ZSHIFT 4 +#define KMEM_ZBASE 16 +#define KMEM_ZMASK (KMEM_ZBASE - 1) + +#define KMEM_ZMAX 65536 +#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT) +static u_int8_t kmemsize[KMEM_ZSIZE + 1]; + +/* These won't be powers of two for long */ +struct { + int kz_size; + char *kz_name; + uma_zone_t kz_zone; +} kmemzones[] = { + {16, "16", NULL}, + {32, "32", NULL}, + {64, "64", NULL}, + {128, "128", NULL}, + {256, "256", NULL}, + {512, "512", NULL}, + {1024, "1024", NULL}, + {2048, "2048", NULL}, + {4096, "4096", NULL}, + {8192, "8192", NULL}, + {16384, "16384", NULL}, + {32768, "32768", NULL}, + {65536, "65536", NULL}, + {0, NULL}, +}; + +u_int vm_kmem_size; + +/* + * The malloc_mtx protects the kmemstatistics linked list as well as the + * mallochash. + */ + +struct mtx malloc_mtx; + +#ifdef MALLOC_PROFILE +uint64_t krequests[KMEM_ZSIZE + 1]; + +static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS); +#endif + +static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS); + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + */ +void * +malloc(size, type, flags) + unsigned long size; + struct malloc_type *type; + int flags; +{ + int indx; + caddr_t va; + uma_zone_t zone; + register struct malloc_type *ksp = type; + +#if 0 + if (size == 0) + Debugger("zero size malloc"); +#endif + if (!(flags & M_NOWAIT)) + KASSERT(curthread->td_intr_nesting_level == 0, + ("malloc(M_WAITOK) in interrupt context")); + if (size <= KMEM_ZMAX) { + if (size & KMEM_ZMASK) + size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; + indx = kmemsize[size >> KMEM_ZSHIFT]; + zone = kmemzones[indx].kz_zone; +#ifdef MALLOC_PROFILE + krequests[size >> KMEM_ZSHIFT]++; +#endif + va = uma_zalloc(zone, flags); + mtx_lock(&ksp->ks_mtx); + if (va == NULL) + goto out; + + ksp->ks_size |= 1 << indx; + size = zone->uz_size; + } else { + size = roundup(size, PAGE_SIZE); + zone = NULL; + va = uma_large_malloc(size, flags); + mtx_lock(&ksp->ks_mtx); + if (va == NULL) + goto out; + } + ksp->ks_memuse += size; + ksp->ks_inuse++; +out: + ksp->ks_calls++; + if (ksp->ks_memuse > ksp->ks_maxused) + ksp->ks_maxused = ksp->ks_memuse; + + mtx_unlock(&ksp->ks_mtx); + return ((void *) va); +} + +/* + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. + */ +void +free(addr, type) + void *addr; + struct malloc_type *type; +{ + uma_slab_t slab; + void *mem; + u_long size; + register struct malloc_type *ksp = type; + + /* free(NULL, ...) 
does nothing */ + if (addr == NULL) + return; + + size = 0; + + mem = (void *)((u_long)addr & (~UMA_SLAB_MASK)); + mtx_lock(&malloc_mtx); + slab = hash_sfind(mallochash, mem); + mtx_unlock(&malloc_mtx); + + if (slab == NULL) + panic("free: address %p(%p) has not been allocated.\n", + addr, mem); + + if (!(slab->us_flags & UMA_SLAB_MALLOC)) { +#ifdef INVARIANTS + struct malloc_type **mtp = addr; +#endif + size = slab->us_zone->uz_size; +#ifdef INVARIANTS + /* + * Cache a pointer to the malloc_type that most recently freed + * this memory here. This way we know who is most likely to + * have stepped on it later. + * + * This code assumes that size is a multiple of 8 bytes for + * 64 bit machines + */ + mtp = (struct malloc_type **) + ((unsigned long)mtp & ~UMA_ALIGN_PTR); + mtp += (size - sizeof(struct malloc_type *)) / + sizeof(struct malloc_type *); + *mtp = type; +#endif + uma_zfree_arg(slab->us_zone, addr, slab); + } else { + size = slab->us_size; + uma_large_free(slab); + } + mtx_lock(&ksp->ks_mtx); + ksp->ks_memuse -= size; + ksp->ks_inuse--; + mtx_unlock(&ksp->ks_mtx); +} + +/* + * realloc: change the size of a memory block + */ +void * +realloc(addr, size, type, flags) + void *addr; + unsigned long size; + struct malloc_type *type; + int flags; +{ + uma_slab_t slab; + unsigned long alloc; + void *newaddr; + + /* realloc(NULL, ...) is equivalent to malloc(...) */ + if (addr == NULL) + return (malloc(size, type, flags)); + + mtx_lock(&malloc_mtx); + slab = hash_sfind(mallochash, + (void *)((u_long)addr & ~(UMA_SLAB_MASK))); + mtx_unlock(&malloc_mtx); + + /* Sanity check */ + KASSERT(slab != NULL, + ("realloc: address %p out of range", (void *)addr)); + + /* Get the size of the original block */ + if (slab->us_zone) + alloc = slab->us_zone->uz_size; + else + alloc = slab->us_size; + + /* Reuse the original block if appropriate */ + if (size <= alloc + && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE)) + return (addr); + + /* Allocate a new, bigger (or smaller) block */ + if ((newaddr = malloc(size, type, flags)) == NULL) + return (NULL); + + /* Copy over original contents */ + bcopy(addr, newaddr, min(size, alloc)); + free(addr, type); + return (newaddr); +} + +/* + * reallocf: same as realloc() but free memory on failure. + */ +void * +reallocf(addr, size, type, flags) + void *addr; + unsigned long size; + struct malloc_type *type; + int flags; +{ + void *mem; + + if ((mem = realloc(addr, size, type, flags)) == NULL) + free(addr, type); + return (mem); +} + +/* + * Initialize the kernel memory allocator + */ +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; +{ + u_int8_t indx; + u_long npg; + u_long mem_size; + void *hashmem; + u_long hashsize; + int highbit; + int bits; + int i; + + mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF); + + /* + * Try to auto-tune the kernel memory size, so that it is + * more applicable for a wider range of machine sizes. + * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while + * a VM_KMEM_SIZE of 12MB is a fair compromise. The + * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space + * available, and on an X86 with a total KVA space of 256MB, + * try to keep VM_KMEM_SIZE_MAX at 80MB or below. + * + * Note that the kmem_map is also used by the zone allocator, + * so make sure that there is enough space. 
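+ * + * As an illustrative note (an assumption, not stated in this file): since + * the final override below comes from TUNABLE_INT_FETCH("kern.vm.kmem.size", + * ...), an administrator could pin the size by setting a loader tunable such + * as kern.vm.kmem.size=67108864.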
+ */ + vm_kmem_size = VM_KMEM_SIZE; + mem_size = cnt.v_page_count * PAGE_SIZE; + +#if defined(VM_KMEM_SIZE_SCALE) + if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size) + vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE; +#endif + +#if defined(VM_KMEM_SIZE_MAX) + if (vm_kmem_size >= VM_KMEM_SIZE_MAX) + vm_kmem_size = VM_KMEM_SIZE_MAX; +#endif + + /* Allow final override from the kernel environment */ + TUNABLE_INT_FETCH("kern.vm.kmem.size", &vm_kmem_size); + + /* + * Limit kmem virtual size to twice the physical memory. + * This allows for kmem map sparseness, but limits the size + * to something sane. Be careful to not overflow the 32bit + * ints while doing the check. + */ + if ((vm_kmem_size / 2) > (cnt.v_page_count * PAGE_SIZE)) + vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE; + + /* + * In mbuf_init(), we set up submaps for mbufs and clusters, in which + * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES), + * respectively. Mathematically, this means that what we do here may + * amount to slightly more address space than we need for the submaps, + * but it never hurts to have an extra page in kmem_map. + */ + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt * + sizeof(u_int) + vm_kmem_size) / PAGE_SIZE; + + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + kmem_map->system_map = 1; + + hashsize = npg * sizeof(void *); + + highbit = 0; + bits = 0; + /* The hash size must be a power of two */ + for (i = 0; i < 8 * sizeof(hashsize); i++) + if (hashsize & (1 << i)) { + highbit = i; + bits++; + } + if (bits > 1) + hashsize = 1 << (highbit); + + hashmem = (void *)kmem_alloc(kernel_map, (vm_size_t)hashsize); + uma_startup2(hashmem, hashsize / sizeof(void *)); + + for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { + int size = kmemzones[indx].kz_size; + char *name = kmemzones[indx].kz_name; + + kmemzones[indx].kz_zone = uma_zcreate(name, size, +#ifdef INVARIANTS + mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_PTR, UMA_ZONE_MALLOC); + + for (;i <= size; i+= KMEM_ZBASE) + kmemsize[i >> KMEM_ZSHIFT] = indx; + + } +} + +void +malloc_init(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + + mtx_lock(&malloc_mtx); + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_init not allowed before vm init"); + + if (type->ks_next != NULL) + return; + + type->ks_next = kmemstatistics; + kmemstatistics = type; + mtx_init(&type->ks_mtx, type->ks_shortdesc, "Malloc Stats", MTX_DEF); + mtx_unlock(&malloc_mtx); +} + +void +malloc_uninit(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + struct malloc_type *t; + + mtx_lock(&malloc_mtx); + mtx_lock(&type->ks_mtx); + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_uninit not allowed before vm init"); + + if (type == kmemstatistics) + kmemstatistics = type->ks_next; + else { + for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) { + if (t->ks_next == type) { + t->ks_next = type->ks_next; + break; + } + } + } + type->ks_next = NULL; + mtx_destroy(&type->ks_mtx); + mtx_unlock(&malloc_mtx); +} + +static int +sysctl_kern_malloc(SYSCTL_HANDLER_ARGS) +{ + struct malloc_type *type; + int linesize = 128; + int curline; + int bufsize; + int first; + int error; + char *buf; + char *p; + int cnt; + int len; + int i; + + 
cnt = 0; + + mtx_lock(&malloc_mtx); + for (type = kmemstatistics; type != NULL; type = type->ks_next) + cnt++; + + mtx_unlock(&malloc_mtx); + bufsize = linesize * (cnt + 1); + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + mtx_lock(&malloc_mtx); + + len = snprintf(p, linesize, + "\n Type InUse MemUse HighUse Requests Size(s)\n"); + p += len; + + for (type = kmemstatistics; cnt != 0 && type != NULL; + type = type->ks_next, cnt--) { + if (type->ks_calls == 0) + continue; + + curline = linesize - 2; /* Leave room for the \n */ + len = snprintf(p, curline, "%13s%6lu%6luK%7luK%9llu", + type->ks_shortdesc, + type->ks_inuse, + (type->ks_memuse + 1023) / 1024, + (type->ks_maxused + 1023) / 1024, + (long long unsigned)type->ks_calls); + curline -= len; + p += len; + + first = 1; + for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1; + i++) { + if (type->ks_size & (1 << i)) { + if (first) + len = snprintf(p, curline, " "); + else + len = snprintf(p, curline, ","); + curline -= len; + p += len; + + len = snprintf(p, curline, + "%s", kmemzones[i].kz_name); + curline -= len; + p += len; + + first = 0; + } + } + + len = snprintf(p, 2, "\n"); + p += len; + } + + mtx_unlock(&malloc_mtx); + error = SYSCTL_OUT(req, buf, p - buf); + + free(buf, M_TEMP); + return (error); +} + +SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats"); + +#ifdef MALLOC_PROFILE + +static int +sysctl_kern_mprof(SYSCTL_HANDLER_ARGS) +{ + int linesize = 64; + uint64_t count; + uint64_t waste; + uint64_t mem; + int bufsize; + int error; + char *buf; + int rsize; + int size; + char *p; + int len; + int i; + + bufsize = linesize * (KMEM_ZSIZE + 1); + bufsize += 128; /* For the stats line */ + bufsize += 128; /* For the banner line */ + waste = 0; + mem = 0; + + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + len = snprintf(p, bufsize, + "\n Size Requests Real Size\n"); + bufsize -= len; + p += len; + + for (i = 0; i < KMEM_ZSIZE; i++) { + size = i << KMEM_ZSHIFT; + rsize = kmemzones[kmemsize[i]].kz_size; + count = (long long unsigned)krequests[i]; + + len = snprintf(p, bufsize, "%6d%28llu%11d\n", + size, (unsigned long long)count, rsize); + bufsize -= len; + p += len; + + if ((rsize * count) > (size * count)) + waste += (rsize * count) - (size * count); + mem += (rsize * count); + } + + len = snprintf(p, bufsize, + "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n", + (unsigned long long)mem, (unsigned long long)waste); + p += len; + + error = SYSCTL_OUT(req, buf, p - buf); + + free(buf, M_TEMP); + return (error); +} + +SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling"); +#endif /* MALLOC_PROFILE */ diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..ebcba94 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,336 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $FreeBSD$ + */ + +#include "opt_posix.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/jail.h> +#include <sys/smp.h> + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, + "Sizeof various things"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); +SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, + "p1003_1b, (see p1003_1b.h)"); + +SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0, + "Compatibility code"); +SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0, + "Security"); +#ifdef REGRESSION +SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0, + "Regression test MIB"); +#endif + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, + osrelease, 0, "Operating system release"); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, + 0, BSD, "Operating system revision"); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, + version, 0, "Kernel version"); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, + ostype, 0, "Operating system type"); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, + &osreldate, 0, "Operating system release date"); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, + &maxproc, 0, "Maximum number of processes"); + +SYSCTL_INT(_kern, 
KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, + &maxprocperuid, 0, "Maximum processes allowed per userid"); + +SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RD, + &maxusers, 0, "Hint for kernel tuning"); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, + 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, + 0, _KPOSIX_VERSION, "Version of POSIX attempting to comply to"); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, + 0, NGROUPS_MAX, "Maximum number of groups a user can belong to"); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, + 0, 1, "Whether job control is available"); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, + 0, 1, "Whether saved set-group/user ID is available"); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, + 0, 0, "Whether saved set-group/user ID is available"); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + +SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, + kernelname, sizeof kernelname, "Name of kernel file booted"); + +#ifdef SMP +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, + &mp_ncpus, 0, "Number of active CPUs"); +#else +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, + 0, 1, "Number of active CPUs"); +#endif + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, + 0, BYTE_ORDER, "System byte order"); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, + 0, PAGE_SIZE, "System memory page size"); + +static char machine_arch[] = MACHINE_ARCH; +SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, + machine_arch, 0, "System architecture"); + +char hostname[MAXHOSTNAMELEN]; + +static int +sysctl_hostname(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + char tmphostname[MAXHOSTNAMELEN]; + int error; + + pr = req->td->td_ucred->cr_prison; + if (pr != NULL) { + if (!jail_set_hostname_allowed && req->newptr) + return (EPERM); + /* + * Process is in jail, so make a local copy of jail + * hostname to get/set so we don't have to hold the jail + * mutex during the sysctl copyin/copyout activities. + */ + mtx_lock(&pr->pr_mtx); + bcopy(pr->pr_host, tmphostname, MAXHOSTNAMELEN); + mtx_unlock(&pr->pr_mtx); + + error = sysctl_handle_string(oidp, tmphostname, + sizeof pr->pr_host, req); + + if (req->newptr != NULL && error == 0) { + /* + * Copy the locally set hostname to the jail, if + * appropriate. + */ + mtx_lock(&pr->pr_mtx); + bcopy(tmphostname, pr->pr_host, MAXHOSTNAMELEN); + mtx_unlock(&pr->pr_mtx); + } + } else + error = sysctl_handle_string(oidp, + hostname, sizeof hostname, req); + return (error); +} + +SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, + CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, + 0, 0, sysctl_hostname, "A", "Hostname"); + +static int regression_securelevel_nonmonotonic = 0; + +#ifdef REGRESSION +SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, + ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); +#endif + +int securelevel = -1; +struct mtx securelevel_mtx; + +MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock", + MTX_DEF); + +static int +sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + int error, level; + + pr = req->td->td_ucred->cr_prison; + + /* + * If the process is in jail, return the maximum of the global and + * local levels; otherwise, return the global level. 
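The hostname handler above deliberately copies the jail's hostname into a local buffer under pr_mtx, does the slow copyin/copyout work unlocked, and only then publishes the result back under the lock. A small pthread sketch of that snapshot-and-publish pattern; names and the API here are hypothetical, not the sysctl interface:

/* Illustrative sketch of the copy-under-lock pattern; not FreeBSD code. */
#include <pthread.h>
#include <string.h>
#include <stdio.h>

#define HOSTLEN 256

static pthread_mutex_t host_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_host[HOSTLEN] = "old.example.org";

/* "Slow" operation that must not run with the lock held. */
static int user_exchange(char *buf, size_t len, const char *newname) {
    printf("current: %s\n", buf);
    if (newname != NULL)
        snprintf(buf, len, "%s", newname);
    return (newname != NULL);
}

static void get_set_hostname(const char *newname) {
    char local[HOSTLEN];
    int changed;

    pthread_mutex_lock(&host_lock);
    memcpy(local, shared_host, sizeof(local));          /* snapshot */
    pthread_mutex_unlock(&host_lock);

    changed = user_exchange(local, sizeof(local), newname);

    if (changed) {
        pthread_mutex_lock(&host_lock);
        memcpy(shared_host, local, sizeof(shared_host)); /* publish */
        pthread_mutex_unlock(&host_lock);
    }
}

int main(void) {
    get_set_hostname("new.example.org");
    get_set_hostname(NULL);
    return 0;
}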
+ */ + if (pr != NULL) { + mtx_lock(&pr->pr_mtx); + level = imax(securelevel, pr->pr_securelevel); + mtx_unlock(&pr->pr_mtx); + } else + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + /* + * Permit update only if the new securelevel exceeds the + * global level, and local level if any. + */ + if (pr != NULL) { + mtx_lock(&pr->pr_mtx); + if (!regression_securelevel_nonmonotonic && + (level < imax(securelevel, pr->pr_securelevel))) { + mtx_unlock(&pr->pr_mtx); + return (EPERM); + } + pr->pr_securelevel = level; + mtx_unlock(&pr->pr_mtx); + } else { + mtx_lock(&securelevel_mtx); + if (!regression_securelevel_nonmonotonic && + (level < securelevel)) { + mtx_unlock(&securelevel_mtx); + return (EPERM); + } + securelevel = level; + mtx_unlock(&securelevel_mtx); + } + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl, + "I", "Current secure level"); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); + +u_long hostid; +SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. + */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, + "", 0, "PATH that finds all the standard utilities"); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, + 0, 0, "Max ibase/obase values in bc(1)"); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, + 0, 0, "Max array size in bc(1)"); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, + 0, 0, "Max scale value in bc(1)"); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, + 0, 0, "Max string length in bc(1)"); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, + 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, + 0, 0, "Max length (bytes) of a text-processing utility's input line"); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, + 0, 0, "Maximum number of repeats of a regexp permitted"); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, + 0, 0, + "The version of POSIX 1003.2 with which the system attempts to comply"); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, + 0, 0, "Whether C development supports the C bindings option"); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, + 0, 0, "Whether system supports the C development utilities option"); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, + 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, + 0, 0, "Whether system supports FORTRAN development utilities"); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, + 0, 0, "Whether system supports FORTRAN runtime utilities"); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, + 0, 0, "Whether system supports creation of locales"); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, + 0, 0, "Whether system supports software development utilities"); +SYSCTL_INT(_user, 
USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, + 0, 0, "Whether system supports the user portability utilities"); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, + 0, 0, "Min Maximum number of streams a process may have open at one time"); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, + 0, 0, "Min Maximum number of types supported for timezone names"); + +#include <sys/vnode.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, + 0, sizeof(struct vnode), "sizeof(struct vnode)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, + 0, sizeof(struct proc), "sizeof(struct proc)"); + +#include <sys/conf.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, + 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); + +#include <sys/bio.h> +#include <sys/buf.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, + 0, sizeof(struct bio), "sizeof(struct bio)"); +SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, + 0, sizeof(struct buf), "sizeof(struct buf)"); + +#include <sys/user.h> +SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, + 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c new file mode 100644 index 0000000..74a0259 --- /dev/null +++ b/sys/kern/kern_module.c @@ -0,0 +1,394 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/eventhandler.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> +#include <sys/linker.h> + +static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); + +typedef TAILQ_HEAD(, module) modulelist_t; +struct module { + TAILQ_ENTRY(module) link; /* chain together all modules */ + TAILQ_ENTRY(module) flink; /* all modules in a file */ + struct linker_file *file; /* file which contains this module */ + int refs; /* reference count */ + int id; /* unique id number */ + char *name; /* module name */ + modeventhand_t handler; /* event handler */ + void *arg; /* argument for handler */ + modspecific_t data; /* module specific data */ +}; + +#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) + +static modulelist_t modules; +struct sx modules_sx; +static int nextid = 1; +static void module_shutdown(void *, int); + +static int +modevent_nop(module_t mod, int what, void *arg) +{ + return (0); +} + +static void +module_init(void *arg) +{ + + sx_init(&modules_sx, "module subsystem sx lock"); + TAILQ_INIT(&modules); + EVENTHANDLER_REGISTER(shutdown_post_sync, module_shutdown, NULL, + SHUTDOWN_PRI_DEFAULT); +} + +SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0) + +static void +module_shutdown(void *arg1, int arg2) +{ + module_t mod; + + MOD_SLOCK; + TAILQ_FOREACH(mod, &modules, link) + MOD_EVENT(mod, MOD_SHUTDOWN); + MOD_SUNLOCK; +} + +void +module_register_init(const void *arg) +{ + const moduledata_t *data = (const moduledata_t *)arg; + int error; + module_t mod; + + MOD_SLOCK; + mod = module_lookupbyname(data->name); + if (mod == NULL) + panic("module_register_init: module named %s not found\n", + data->name); + MOD_SUNLOCK; + error = MOD_EVENT(mod, MOD_LOAD); + if (error) { + MOD_EVENT(mod, MOD_UNLOAD); + MOD_XLOCK; + module_release(mod); + MOD_XUNLOCK; + printf("module_register_init: MOD_LOAD (%s, %p, %p) error" + " %d\n", data->name, (void *)data->evhand, data->priv, + error); + } +} + +int +module_register(const moduledata_t *data, linker_file_t container) +{ + size_t namelen; + module_t newmod; + + MOD_SLOCK; + newmod = module_lookupbyname(data->name); + if (newmod != NULL) { + MOD_SUNLOCK; + printf("module_register: module %s already exists!\n", + data->name); + return (EEXIST); + } + MOD_SUNLOCK; + namelen = strlen(data->name) + 1; + newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); + if (newmod == NULL) + return (ENOMEM); + MOD_XLOCK; + newmod->refs = 1; + newmod->id = nextid++; + newmod->name = (char *)(newmod + 1); + strcpy(newmod->name, data->name); + newmod->handler = data->evhand ? 
data->evhand : modevent_nop; + newmod->arg = data->priv; + bzero(&newmod->data, sizeof(newmod->data)); + TAILQ_INSERT_TAIL(&modules, newmod, link); + + if (container) + TAILQ_INSERT_TAIL(&container->modules, newmod, flink); + newmod->file = container; + MOD_XUNLOCK; + return (0); +} + +void +module_reference(module_t mod) +{ + + MOD_XLOCK_ASSERT; + + MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); + mod->refs++; +} + +void +module_release(module_t mod) +{ + + MOD_XLOCK_ASSERT; + + if (mod->refs <= 0) + panic("module_release: bad reference count"); + + MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); + + mod->refs--; + if (mod->refs == 0) { + TAILQ_REMOVE(&modules, mod, link); + if (mod->file) + TAILQ_REMOVE(&mod->file->modules, mod, flink); + MOD_XUNLOCK; + free(mod, M_MODULE); + MOD_XLOCK; + } +} + +module_t +module_lookupbyname(const char *name) +{ + module_t mod; + int err; + + MOD_LOCK_ASSERT; + + TAILQ_FOREACH(mod, &modules, link) { + err = strcmp(mod->name, name); + if (err == 0) + return (mod); + } + return (NULL); +} + +module_t +module_lookupbyid(int modid) +{ + module_t mod; + + MOD_LOCK_ASSERT; + + TAILQ_FOREACH(mod, &modules, link) + if (mod->id == modid) + return(mod); + return (NULL); +} + +int +module_unload(module_t mod) +{ + + return (MOD_EVENT(mod, MOD_UNLOAD)); +} + +int +module_getid(module_t mod) +{ + + MOD_LOCK_ASSERT; + return (mod->id); +} + +module_t +module_getfnext(module_t mod) +{ + + MOD_LOCK_ASSERT; + return (TAILQ_NEXT(mod, flink)); +} + +void +module_setspecific(module_t mod, modspecific_t *datap) +{ + + MOD_XLOCK_ASSERT; + mod->data = *datap; +} + +/* + * Syscalls. + */ +/* + * MPSAFE + */ +int +modnext(struct thread *td, struct modnext_args *uap) +{ + module_t mod; + int error = 0; + + td->td_retval[0] = -1; + + MOD_SLOCK; + if (SCARG(uap, modid) == 0) { + mod = TAILQ_FIRST(&modules); + if (mod) + td->td_retval[0] = mod->id; + else + error = ENOENT; + goto done2; + } + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + error = ENOENT; + goto done2; + } + if (TAILQ_NEXT(mod, link)) + td->td_retval[0] = TAILQ_NEXT(mod, link)->id; + else + td->td_retval[0] = 0; +done2: + MOD_SUNLOCK; + return (error); +} + +/* + * MPSAFE + */ +int +modfnext(struct thread *td, struct modfnext_args *uap) +{ + module_t mod; + int error; + + td->td_retval[0] = -1; + + MOD_SLOCK; + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + error = ENOENT; + } else { + error = 0; + if (TAILQ_NEXT(mod, flink)) + td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; + else + td->td_retval[0] = 0; + } + MOD_SUNLOCK; + return (error); +} + +struct module_stat_v1 { + int version; /* set to sizeof(struct module_stat) */ + char name[MAXMODNAME]; + int refs; + int id; +}; + +/* + * MPSAFE + */ +int +modstat(struct thread *td, struct modstat_args *uap) +{ + module_t mod; + modspecific_t data; + int error = 0; + int id, namelen, refs, version; + struct module_stat *stat; + char *name; + + MOD_SLOCK; + mod = module_lookupbyid(SCARG(uap, modid)); + if (mod == NULL) { + MOD_SUNLOCK; + return (ENOENT); + } + id = mod->id; + refs = mod->refs; + name = mod->name; + data = mod->data; + MOD_SUNLOCK; + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
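The comment above refers to the versioning idiom modstat() uses: userland stores sizeof(its struct) in the first field, and the handler accepts either the v1 or the current layout, filling only the fields that exist. A hedged userland sketch of the same idea, with made-up struct names:

/* Minimal sketch of the versioned-struct idiom; struct names are hypothetical. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct stat_v1 { int version; char name[32]; int refs; int id; };
struct stat_v2 { int version; char name[32]; int refs; int id; long data; };

static int fill_stat(void *out) {
    int version;

    memcpy(&version, out, sizeof(version));      /* like copyin(&stat->version) */
    if (version != sizeof(struct stat_v1) && version != sizeof(struct stat_v2))
        return (EINVAL);

    struct stat_v1 *s = out;
    snprintf(s->name, sizeof(s->name), "example_mod");
    s->refs = 1;
    s->id = 42;
    if (version == sizeof(struct stat_v2))       /* only newer callers get this */
        ((struct stat_v2 *)out)->data = 0x1234;
    return (0);
}

int main(void) {
    struct stat_v2 st = { .version = sizeof(st) };
    if (fill_stat(&st) == 0)
        printf("%s id=%d data=%ld\n", st.name, st.id, st.data);
    return 0;
}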
+ */ + if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) + return (error); + if (version != sizeof(struct module_stat_v1) + && version != sizeof(struct module_stat)) + return (EINVAL); + namelen = strlen(mod->name) + 1; + if (namelen > MAXMODNAME) + namelen = MAXMODNAME; + if ((error = copyout(name, &stat->name[0], namelen)) != 0) + return (error); + + if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) + return (error); + + /* + * >v1 stat includes module data. + */ + if (version == sizeof(struct module_stat)) + if ((error = copyout(&data, &stat->data, + sizeof(data))) != 0) + return (error); + td->td_retval[0] = 0; + return (error); +} + +/* + * MPSAFE + */ +int +modfind(struct thread *td, struct modfind_args *uap) +{ + int error = 0; + char name[MAXMODNAME]; + module_t mod; + + if ((error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) != 0) + return (error); + + MOD_SLOCK; + mod = module_lookupbyname(name); + if (mod == NULL) + error = ENOENT; + else + td->td_retval[0] = module_getid(mod); + MOD_SUNLOCK; + return (error); +} diff --git a/sys/kern/kern_mtxpool.c b/sys/kern/kern_mtxpool.c new file mode 100644 index 0000000..3d4aa1c --- /dev/null +++ b/sys/kern/kern_mtxpool.c @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. Copyright + * terms are as specified in the COPYRIGHT file at the base of the source + * tree. + * + * Mutex pool routines. These routines are designed to be used as short + * term leaf mutexes (e.g. the last mutex you might aquire other then + * calling msleep()). They operate using a shared pool. A mutex is chosen + * from the pool based on the supplied pointer (which may or may not be + * valid). + * + * Advantages: + * - no structural overhead. Mutexes can be associated with structures + * without adding bloat to the structures. + * - mutexes can be obtained for invalid pointers, useful when uses + * mutexes to interlock destructor ops. + * - no initialization/destructor overhead + * - can be used with msleep. + * + * Disadvantages: + * - should generally only be used as leaf mutexes + * - pool/pool dependancy ordering cannot be depended on. + * - possible L1 cache mastersip contention between cpus + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#ifndef MTX_POOL_SIZE +#define MTX_POOL_SIZE 128 +#endif +#define MTX_POOL_MASK (MTX_POOL_SIZE-1) + +static struct mtx mtx_pool_ary[MTX_POOL_SIZE]; + +int mtx_pool_valid = 0; + +/* + * Inline version of mtx_pool_find(), used to streamline our main API + * function calls. + */ +static __inline +struct mtx * +_mtx_pool_find(void *ptr) +{ + int p; + + p = (int)(uintptr_t)ptr; + return(&mtx_pool_ary[(p ^ (p >> 6)) & MTX_POOL_MASK]); +} + +static void +mtx_pool_setup(void *dummy __unused) +{ + int i; + + for (i = 0; i < MTX_POOL_SIZE; ++i) + mtx_init(&mtx_pool_ary[i], "pool mutex", NULL, MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + mtx_pool_valid = 1; +} + +/* + * Obtain a (shared) mutex from the pool. The returned mutex is a leaf + * level mutex, meaning that if you obtain it you cannot obtain any other + * mutexes until you release it. You can legally msleep() on the mutex. 
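The pool lookup above hashes an arbitrary pointer into a fixed array of mutexes, so an object gets short-term locking without embedding a mutex of its own. A userland sketch using pthread mutexes; the hash expression mirrors _mtx_pool_find(), everything else is illustrative:

/* Userland sketch of the pool-mutex idea; not the kernel implementation. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define POOL_SIZE 128                 /* must be a power of two */
#define POOL_MASK (POOL_SIZE - 1)

static pthread_mutex_t pool[POOL_SIZE];

static pthread_mutex_t *pool_find(void *ptr) {
    int p = (int)(uintptr_t)ptr;
    return (&pool[(p ^ (p >> 6)) & POOL_MASK]);
}

static void pool_setup(void) {
    for (int i = 0; i < POOL_SIZE; i++)
        pthread_mutex_init(&pool[i], NULL);
}

int main(void) {
    int some_object;

    pool_setup();
    pthread_mutex_lock(pool_find(&some_object));   /* like mtx_pool_lock(ptr) */
    some_object = 1;
    pthread_mutex_unlock(pool_find(&some_object)); /* like mtx_pool_unlock(ptr) */
    printf("object=%d, slot=%ld\n", some_object,
        (long)(pool_find(&some_object) - pool));
    return 0;
}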
+ */ +struct mtx * +mtx_pool_alloc(void) +{ + static int si; + return(&mtx_pool_ary[si++ & MTX_POOL_MASK]); +} + +/* + * Return the (shared) pool mutex associated with the specified address. + * The returned mutex is a leaf level mutex, meaning that if you obtain it + * you cannot obtain any other mutexes until you release it. You can + * legally msleep() on the mutex. + */ +struct mtx * +mtx_pool_find(void *ptr) +{ + return(_mtx_pool_find(ptr)); +} + +/* + * Combined find/lock operation. Lock the pool mutex associated with + * the specified address. + */ +void +mtx_pool_lock(void *ptr) +{ + mtx_lock(_mtx_pool_find(ptr)); +} + +/* + * Combined find/unlock operation. Unlock the pool mutex associated with + * the specified address. + */ +void +mtx_pool_unlock(void *ptr) +{ + mtx_unlock(_mtx_pool_find(ptr)); +} + +SYSINIT(mtxpooli, SI_SUB_MTX_POOL, SI_ORDER_FIRST, mtx_pool_setup, NULL) + diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c new file mode 100644 index 0000000..08bca8d --- /dev/null +++ b/sys/kern/kern_mutex.c @@ -0,0 +1,986 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Machine independent bits of mutex implementation. + */ + +#include "opt_adaptive_mutexes.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sbuf.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> + +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/clock.h> +#include <machine/cpu.h> + +#include <ddb/ddb.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +/* + * Internal utility macros. 
+ */ +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) + +#define mtx_owner(m) (mtx_unowned((m)) ? NULL \ + : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK)) + +/* XXXKSE This test will change. */ +#define thread_running(td) \ + ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU) + +/* + * Lock classes for sleep and spin mutexes. + */ +struct lock_class lock_class_mtx_sleep = { + "sleep mutex", + LC_SLEEPLOCK | LC_RECURSABLE +}; +struct lock_class lock_class_mtx_spin = { + "spin mutex", + LC_SPINLOCK | LC_RECURSABLE +}; + +/* + * System-wide mutexes + */ +struct mtx sched_lock; +struct mtx Giant; + +/* + * Prototypes for non-exported routines. + */ +static void propagate_priority(struct thread *); + +static void +propagate_priority(struct thread *td) +{ + int pri = td->td_priority; + struct mtx *m = td->td_blocked; + + mtx_assert(&sched_lock, MA_OWNED); + for (;;) { + struct thread *td1; + + td = mtx_owner(m); + + if (td == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of thread that + * next acquires the mutex. + */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + + MPASS(td->td_proc->p_magic == P_MAGIC); + KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + if (td->td_priority <= pri) /* lower is higher priority */ + return; + + /* + * Bump this thread's priority. + */ + td->td_priority = pri; + + /* + * If lock holder is actually running, just bump priority. + */ + if (thread_running(td)) { + MPASS(td->td_proc->p_stat == SRUN + || td->td_proc->p_stat == SZOMB + || td->td_proc->p_stat == SSTOP); + return; + } + +#ifndef SMP + /* + * For UP, we check to see if td is curthread (this shouldn't + * ever happen however as it would mean we are in a deadlock.) + */ + KASSERT(td != curthread, ("Deadlock detected")); +#endif + + /* + * If on run queue move to new run queue, and quit. + * XXXKSE this gets a lot more complicated under threads + * but try anyhow. + */ + if (td->td_proc->p_stat == SRUN) { + MPASS(td->td_blocked == NULL); + remrunqueue(td); + setrunqueue(td); + return; + } + + /* + * If we aren't blocked on a mutex, we should be. + */ + KASSERT(td->td_proc->p_stat == SMTX, ( + "process %d(%s):%d holds %s but isn't blocked on a mutex\n", + td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + m->mtx_object.lo_name)); + + /* + * Pick up the mutex that td is blocked on. + */ + m = td->td_blocked; + MPASS(m != NULL); + + /* + * Check if the thread needs to be moved up on + * the blocked chain + */ + if (td == TAILQ_FIRST(&m->mtx_blocked)) { + continue; + } + + td1 = TAILQ_PREV(td, threadqueue, td_blkq); + if (td1->td_priority <= pri) { + continue; + } + + /* + * Remove thread from blocked chain and determine where + * it should be moved up to. Since we know that td1 has + * a lower priority than td, we know that at least one + * thread in the chain has a lower priority and that + * td1 will thus not be NULL after the loop. 
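propagate_priority() above relies on the queue of blocked threads staying sorted by priority (numerically lower means more urgent), with a new waiter inserted before the first waiter that is less urgent. A simplified stand-in for that sorted insertion, using a plain linked list instead of the kernel's TAILQ of struct thread:

/* Sketch of priority-ordered wait-queue insertion; everything is illustrative. */
#include <stdio.h>

struct waiter {
    int priority;                 /* lower value = higher priority */
    struct waiter *next;
};

static void enqueue_sorted(struct waiter **head, struct waiter *w) {
    struct waiter **pp = head;

    while (*pp != NULL && (*pp)->priority <= w->priority)
        pp = &(*pp)->next;        /* FIFO among equal priorities */
    w->next = *pp;
    *pp = w;
}

int main(void) {
    struct waiter a = { 120, NULL }, b = { 80, NULL }, c = { 120, NULL };
    struct waiter *head = NULL, *w;

    enqueue_sorted(&head, &a);
    enqueue_sorted(&head, &b);
    enqueue_sorted(&head, &c);
    for (w = head; w != NULL; w = w->next)
        printf("pri %d\n", w->priority);          /* prints 80, 120, 120 */
    return 0;
}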
+ */ + TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq); + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) { + MPASS(td1->td_proc->p_magic == P_MAGIC); + if (td1->td_priority > pri) + break; + } + + MPASS(td1 != NULL); + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + CTR4(KTR_LOCK, + "propagate_priority: p %p moved before %p on [%p] %s", + td, td1, m, m->mtx_object.lo_name); + } +} + +#ifdef MUTEX_PROFILING +SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging"); +SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling"); +static int mutex_prof_enable = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW, + &mutex_prof_enable, 0, "Enable tracing of mutex holdtime"); + +struct mutex_prof { + const char *name; + const char *file; + int line; +#define MPROF_MAX 0 +#define MPROF_TOT 1 +#define MPROF_CNT 2 +#define MPROF_AVG 3 + uintmax_t counter[4]; + struct mutex_prof *next; +}; + +/* + * mprof_buf is a static pool of profiling records to avoid possible + * reentrance of the memory allocation functions. + * + * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE. + */ +#define NUM_MPROF_BUFFERS 1000 +static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS]; +static int first_free_mprof_buf; +#define MPROF_HASH_SIZE 1009 +static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE]; + +static int mutex_prof_acquisitions; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD, + &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded"); +static int mutex_prof_records; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD, + &mutex_prof_records, 0, "Number of profiling records"); +static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD, + &mutex_prof_maxrecords, 0, "Maximum number of profiling records"); +static int mutex_prof_rejected; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD, + &mutex_prof_rejected, 0, "Number of rejected profiling records"); +static int mutex_prof_hashsize = MPROF_HASH_SIZE; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD, + &mutex_prof_hashsize, 0, "Hash size"); +static int mutex_prof_collisions = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD, + &mutex_prof_collisions, 0, "Number of hash collisions"); + +/* + * mprof_mtx protects the profiling buffers and the hash. 
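The profiler above draws its records from a fixed static pool, so it never re-enters the allocator it may be measuring, and chains them into a hash table keyed on file and line. A compact userland sketch of that arrangement; the pool size and hash are illustrative:

/* Sketch of a static record pool plus chained hash; not the kernel's code. */
#include <stdio.h>
#include <string.h>

#define NRECORDS  1000
#define HASHSIZE  1009                /* prime, larger than NRECORDS */

struct rec {
    const char *file;
    int line;
    unsigned long long total, count;
    struct rec *next;
};

static struct rec pool[NRECORDS];
static int first_free;
static struct rec *hash[HASHSIZE];

static struct rec *lookup(const char *file, int line) {
    unsigned h = (unsigned)line;
    const char *q;
    struct rec *r;

    for (q = file; *q != '\0'; q++)
        h = (h * 2 + (unsigned char)*q) % HASHSIZE;
    for (r = hash[h]; r != NULL; r = r->next)
        if (r->line == line && strcmp(r->file, file) == 0)
            return (r);
    if (first_free >= NRECORDS)       /* pool exhausted: drop the sample */
        return (NULL);
    r = &pool[first_free++];
    r->file = file;
    r->line = line;
    r->next = hash[h];
    hash[h] = r;
    return (r);
}

int main(void) {
    struct rec *r = lookup("kern_example.c", 42);
    if (r != NULL) {
        r->total += 1234;
        r->count++;
        printf("%s:%d avg %llu\n", r->file, r->line, r->total / r->count);
    }
    return 0;
}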
+ */ +static struct mtx mprof_mtx; +MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET); + +static u_int64_t +nanoseconds(void) +{ + struct timespec tv; + + nanotime(&tv); + return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec); +} + +static int +dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int error, i; + + if (first_free_mprof_buf == 0) + return SYSCTL_OUT(req, "No locking recorded", + sizeof("No locking recorded")); + + sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND); + sbuf_printf(sb, "%12s %12s %12s %12s %s\n", + "max", "total", "count", "average", "name"); + mtx_lock_spin(&mprof_mtx); + for (i = 0; i < first_free_mprof_buf; ++i) + sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n", + mprof_buf[i].counter[MPROF_MAX] / 1000, + mprof_buf[i].counter[MPROF_TOT] / 1000, + mprof_buf[i].counter[MPROF_CNT], + mprof_buf[i].counter[MPROF_AVG] / 1000, + mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name); + mtx_unlock_spin(&mprof_mtx); + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return (error); +} +SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics"); +#endif + +/* + * Function versions of the inlined __mtx_* macros. These are used by + * modules and can also be called from assembly language if needed. + */ +void +_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + _get_sleep_lock(m, curthread, opts, file, line); + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +#ifdef MUTEX_PROFILING + /* don't reset the timer when/if recursing */ + if (m->acqtime == 0) { + m->file = file; + m->line = line; + m->acqtime = mutex_prof_enable ? 
nanoseconds() : 0; + ++mutex_prof_acquisitions; + } +#endif +} + +void +_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); +#ifdef MUTEX_PROFILING + if (m->acqtime != 0) { + static const char *unknown = "(unknown)"; + struct mutex_prof *mpp; + u_int64_t acqtime, now; + const char *p, *q; + volatile u_int hash; + + now = nanoseconds(); + acqtime = m->acqtime; + m->acqtime = 0; + if (now <= acqtime) + goto out; + for (p = file; strncmp(p, "../", 3) == 0; p += 3) + /* nothing */ ; + if (p == NULL || *p == '\0') + p = unknown; + for (hash = line, q = p; *q != '\0'; ++q) + hash = (hash * 2 + *q) % MPROF_HASH_SIZE; + mtx_lock_spin(&mprof_mtx); + for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next) + if (mpp->line == line && strcmp(mpp->file, p) == 0) + break; + if (mpp == NULL) { + /* Just exit if we cannot get a trace buffer */ + if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) { + ++mutex_prof_rejected; + goto unlock; + } + mpp = &mprof_buf[first_free_mprof_buf++]; + mpp->name = mtx_name(m); + mpp->file = p; + mpp->line = line; + mpp->next = mprof_hash[hash]; + if (mprof_hash[hash] != NULL) + ++mutex_prof_collisions; + mprof_hash[hash] = mpp; + ++mutex_prof_records; + } + /* + * Record if the mutex has been held longer now than ever + * before + */ + if ((now - acqtime) > mpp->counter[MPROF_MAX]) + mpp->counter[MPROF_MAX] = now - acqtime; + mpp->counter[MPROF_TOT] += now - acqtime; + mpp->counter[MPROF_CNT] += 1; + mpp->counter[MPROF_AVG] = + mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT]; +unlock: + mtx_unlock_spin(&mprof_mtx); + } +out: +#endif + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + _rel_sleep_lock(m, curthread, opts, file, line); +} + +void +_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); +#if defined(SMP) || LOCK_DEBUG > 0 + _get_spin_lock(m, curthread, opts, file, line); +#else + critical_enter(); +#endif + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +} + +void +_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); +#if defined(SMP) || LOCK_DEBUG > 0 + _rel_spin_lock(m); +#else + critical_exit(); +#endif +} + +/* + * The important part of mtx_trylock{,_flags}() + * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that + * if we're called, it's because we know we don't already own this lock. + */ +int +_mtx_trylock(struct mtx *m, int opts, const char *file, int line) +{ + int rval; + + MPASS(curthread != NULL); + + rval = _obtain_lock(m, curthread); + + LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line); + if (rval) { + /* + * We do not handle recursion in _mtx_trylock; see the + * note at the top of the routine. + */ + KASSERT(!mtx_recursed(m), + ("mtx_trylock() called on a recursed mutex")); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + } + + return (rval); +} + +/* + * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. + * + * We call this if the lock is either contested (i.e. 
we need to go to + * sleep waiting for it), or if we need to recurse on it. + */ +void +_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td = curthread; +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + struct thread *owner; +#endif + + if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) { + m->mtx_recurse++; + atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); + return; + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR4(KTR_LOCK, + "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", + m->mtx_object.lo_name, (void *)m->mtx_lock, file, line); + + while (!_obtain_lock(m, td)) { + uintptr_t v; + struct thread *td1; + + mtx_lock_spin(&sched_lock); + /* + * Check if the lock has been released while spinning for + * the sched_lock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + + /* + * The mutex was marked contested on release. This means that + * there are threads blocked on it. + */ + if (v == MTX_CONTESTED) { + td1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(td1 != NULL); + m->mtx_lock = (uintptr_t)td | MTX_CONTESTED; + + if (td1->td_priority < td->td_priority) + td->td_priority = td1->td_priority; + mtx_unlock_spin(&sched_lock); + return; + } + + /* + * If the mutex isn't already contested and a failure occurs + * setting the contested bit, the mutex was either released + * or the state of the MTX_RECURSED bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, + (void *)(v | MTX_CONTESTED))) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + /* + * If the current owner of the lock is executing on another + * CPU, spin instead of blocking. + */ + owner = (struct thread *)(v & MTX_FLAGMASK); + if (m != &Giant && thread_running(owner)) { + mtx_unlock_spin(&sched_lock); + while (mtx_owner(m) == owner && thread_running(owner)) { +#ifdef __i386__ + ia32_pause(); +#endif + } + continue; + } +#endif /* SMP && ADAPTIVE_MUTEXES */ + + /* + * We definitely must sleep for this lock. + */ + mtx_assert(m, MA_NOTOWNED); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM context, we + * must clean up before going to sleep. + */ + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_lock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* + * Put us on the list of threads blocked on this mutex. + */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + td1 = mtx_owner(m); + LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } else { + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) + if (td1->td_priority > td->td_priority) + break; + if (td1) + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } + + /* + * Save who we're blocked on. 
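The contested path above encodes everything in a single lock word: MTX_UNOWNED, or the owner with a contested flag in the low bits, set by a failed CAS so the releaser knows a waiter must be woken. A toy C11 sketch of that protocol, with sched_yield() standing in for actually sleeping; this illustrates the idea only and is not the kernel's _obtain_lock()/_mtx_lock_sleep():

/* Toy lock-word protocol; values and functions here are illustrative. */
#include <stdatomic.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>

#define LK_UNOWNED   0u
#define LK_CONTESTED 1u

static _Atomic uintptr_t lock_word = LK_UNOWNED;

static void lock_acquire(uintptr_t self) {
    for (;;) {
        uintptr_t v = LK_UNOWNED;
        if (atomic_compare_exchange_weak(&lock_word, &v, self))
            return;                           /* fast path: was unowned */
        v = atomic_load(&lock_word);
        if (v != LK_UNOWNED && (v & LK_CONTESTED) == 0)
            atomic_compare_exchange_weak(&lock_word, &v, v | LK_CONTESTED);
        sched_yield();                        /* stand-in for sleeping */
    }
}

static void lock_release(uintptr_t self) {
    uintptr_t v = self;
    if (atomic_compare_exchange_strong(&lock_word, &v, LK_UNOWNED))
        return;                               /* no waiters recorded */
    /* contested: a real implementation would pick a waiter to run here */
    atomic_store(&lock_word, LK_UNOWNED);
}

int main(void) {
    uintptr_t self = 2;                       /* even, so the low flag bit is free */
    lock_acquire(self);
    printf("owner=%#lx\n", (unsigned long)atomic_load(&lock_word));
    lock_release(self);
    return 0;
}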
+ */ + td->td_blocked = m; + td->td_mtxname = m->mtx_object.lo_name; + td->td_proc->p_stat = SMTX; + propagate_priority(td); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m, + m->mtx_object.lo_name); + + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p free from blocked on [%p] %s", + td, m, m->mtx_object.lo_name); + + mtx_unlock_spin(&sched_lock); + } + + return; +} + +/* + * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. + * + * This is only called if we need to actually spin for the lock. Recursion + * is handled inline. + */ +void +_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line) +{ + int i = 0; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); + + for (;;) { + if (_obtain_lock(m, curthread)) + break; + + /* Give interrupts a chance while we spin. */ + critical_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) { +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + if (i < 60000000) + DELAY(1); +#ifdef DDB + else if (!db_active) +#else + else +#endif + panic("spin lock %s held by %p for > 5 seconds", + m->mtx_object.lo_name, (void *)m->mtx_lock); +#ifdef __i386__ + ia32_pause(); +#endif + } + critical_enter(); + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); + + return; +} + +/* + * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. + * + * We are only called here if the lock is recursed or contested (i.e. we + * need to wake up a blocked thread). + */ +void +_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td, *td1; + struct mtx *m1; + int pri; + + td = curthread; + + if (mtx_recursed(m)) { + if (--(m->mtx_recurse) == 0) + atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); + return; + } + + mtx_lock_spin(&sched_lock); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); + + td1 = TAILQ_FIRST(&m->mtx_blocked); +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + if (td1 == NULL) { + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); + mtx_unlock_spin(&sched_lock); + return; + } +#endif + MPASS(td->td_proc->p_magic == P_MAGIC); + MPASS(td1->td_proc->p_magic == P_MAGIC); + + TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq); + + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); + } else + atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); + + pri = PRI_MAX; + LIST_FOREACH(m1, &td->td_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority; + if (cp < pri) + pri = cp; + } + + if (pri > td->td_base_pri) + pri = td->td_base_pri; + td->td_priority = pri; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", + m, td1); + + td1->td_blocked = NULL; + td1->td_proc->p_stat = SRUN; + setrunqueue(td1); + + if (td->td_critnest == 1 && td1->td_priority < pri) { +#ifdef notyet + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, 
opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + setrunqueue(td); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p switching out lock=%p", m, + (void *)m->mtx_lock); + + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", + m, (void *)m->mtx_lock); + } + + mtx_unlock_spin(&sched_lock); + + return; +} + +/* + * All the unlocking of MTX_SPIN locks is done inline. + * See the _rel_spin_lock() macro for the details. + */ + +/* + * The backing function for the INVARIANTS-enabled mtx_assert() + */ +#ifdef INVARIANT_SUPPORT +void +_mtx_assert(struct mtx *m, int what, const char *file, int line) +{ + + if (panicstr != NULL) + return; + switch (what) { + case MA_OWNED: + case MA_OWNED | MA_RECURSED: + case MA_OWNED | MA_NOTRECURSED: + if (!mtx_owned(m)) + panic("mutex %s not owned at %s:%d", + m->mtx_object.lo_name, file, line); + if (mtx_recursed(m)) { + if ((what & MA_NOTRECURSED) != 0) + panic("mutex %s recursed at %s:%d", + m->mtx_object.lo_name, file, line); + } else if ((what & MA_RECURSED) != 0) { + panic("mutex %s unrecursed at %s:%d", + m->mtx_object.lo_name, file, line); + } + break; + case MA_NOTOWNED: + if (mtx_owned(m)) + panic("mutex %s owned at %s:%d", + m->mtx_object.lo_name, file, line); + break; + default: + panic("unknown mtx_assert at %s:%d", file, line); + } +} +#endif + +/* + * The MUTEX_DEBUG-enabled mtx_validate() + * + * Most of these checks have been moved off into the LO_INITIALIZED flag + * maintained by the witness code. + */ +#ifdef MUTEX_DEBUG + +void mtx_validate(struct mtx *); + +void +mtx_validate(struct mtx *m) +{ + +/* + * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly + * we can re-enable the kernacc() checks. + */ +#ifndef __alpha__ + /* + * Can't call kernacc() from early init386(), especially when + * initializing Giant mutex, because some stuff in kernacc() + * requires Giant itself. + */ + if (!cold) + if (!kernacc((caddr_t)m, sizeof(m), + VM_PROT_READ | VM_PROT_WRITE)) + panic("Can't read and write to mutex %p", m); +#endif +} +#endif + +/* + * General init routine used by the MTX_SYSINIT() macro. + */ +void +mtx_sysinit(void *arg) +{ + struct mtx_args *margs = arg; + + mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); +} + +/* + * Mutex initialization routine; initialize lock `m' of type contained in + * `opts' with options contained in `opts' and name `name.' The optional + * lock type `type' is used as a general lock category name for use with + * witness. + */ +void +mtx_init(struct mtx *m, const char *name, const char *type, int opts) +{ + struct lock_object *lock; + + MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | + MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0); + +#ifdef MUTEX_DEBUG + /* Diagnostic and error correction */ + mtx_validate(m); +#endif + + lock = &m->mtx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("mutex %s %p already initialized", name, m)); + bzero(m, sizeof(*m)); + if (opts & MTX_SPIN) + lock->lo_class = &lock_class_mtx_spin; + else + lock->lo_class = &lock_class_mtx_sleep; + lock->lo_name = name; + lock->lo_type = type != NULL ? 
type : name; + if (opts & MTX_QUIET) + lock->lo_flags = LO_QUIET; + if (opts & MTX_RECURSE) + lock->lo_flags |= LO_RECURSABLE; + if (opts & MTX_SLEEPABLE) + lock->lo_flags |= LO_SLEEPABLE; + if ((opts & MTX_NOWITNESS) == 0) + lock->lo_flags |= LO_WITNESS; + if (opts & MTX_DUPOK) + lock->lo_flags |= LO_DUPOK; + + m->mtx_lock = MTX_UNOWNED; + TAILQ_INIT(&m->mtx_blocked); + + LOCK_LOG_INIT(lock, opts); + + WITNESS_INIT(lock); +} + +/* + * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be + * passed in as a flag here because if the corresponding mtx_init() was + * called with MTX_QUIET set, then it will already be set in the mutex's + * flags. + */ +void +mtx_destroy(struct mtx *m) +{ + + LOCK_LOG_DESTROY(&m->mtx_object, 0); + + if (!mtx_owned(m)) + MPASS(mtx_unowned(m)); + else { + MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); + + /* Tell witness this isn't locked to make it happy. */ + WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__, + __LINE__); + } + + WITNESS_DESTROY(&m->mtx_object); +} + +/* + * Intialize the mutex code and system mutexes. This is called from the MD + * startup code prior to mi_startup(). The per-CPU data space needs to be + * setup before this is called. + */ +void +mutex_init(void) +{ + + /* Setup thread0 so that mutexes work. */ + LIST_INIT(&thread0.td_contested); + + /* + * Initialize mutexes. + */ + mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_lock(&Giant); +} + +/* + * Encapsulated Giant mutex routines. These routines provide encapsulation + * control for the Giant mutex, allowing sysctls to be used to turn on and + * off Giant around certain subsystems. The default value for the sysctls + * are set to what developers believe is stable and working in regards to + * the Giant pushdown. Developers should not turn off Giant via these + * sysctls unless they know what they are doing. + * + * Callers of mtx_lock_giant() are expected to pass the return value to an + * accompanying mtx_unlock_giant() later on. If multiple subsystems are + * effected by a Giant wrap, all related sysctl variables must be zero for + * the subsystem call to operate without Giant (as determined by the caller). + */ + +SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation"); + +static int kern_giant_all = 0; +SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, ""); + +int kern_giant_proc = 1; /* Giant around PROC locks */ +int kern_giant_file = 1; /* Giant around struct file & filedesc */ +int kern_giant_ucred = 1; /* Giant around ucred */ +SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, ""); + +int +mtx_lock_giant(int sysctlvar) +{ + if (sysctlvar || kern_giant_all) { + mtx_lock(&Giant); + return(1); + } + return(0); +} + +void +mtx_unlock_giant(int s) +{ + if (s) + mtx_unlock(&Giant); +} + diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..cd2db73 --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,935 @@ +/*********************************************************************** + * * + * Copyright (c) David L. 
Mills 1993-2001 * + * * + * Permission to use, copy, modify, and distribute this software and * + * its documentation for any purpose and without fee is hereby * + * granted, provided that the above copyright notice appears in all * + * copies and that both the copyright notice and this permission * + * notice appear in supporting documentation, and that the name * + * University of Delaware not be used in advertising or publicity * + * pertaining to distribution of the software without specific, * + * written prior permission. The University of Delaware makes no * + * representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied * + * warranty. * + * * + **********************************************************************/ + +/* + * Adapted from the original sources for FreeBSD and timecounters by: + * Poul-Henning Kamp <phk@FreeBSD.org>. + * + * The 32bit version of the "LP" macros seems a bit past its "sell by" + * date so I have retained only the 64bit version and included it directly + * in this file. + * + * Only minor changes done to interface with the timecounters over in + * sys/kern/kern_clock.c. Some of the comments below may be (even more) + * confusing and/or plain wrong in that context. + * + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/time.h> +#include <sys/timex.h> +#include <sys/timetc.h> +#include <sys/timepps.h> +#include <sys/sysctl.h> + +/* + * Single-precision macros for 64-bit machines + */ +typedef long long l_fp; +#define L_ADD(v, u) ((v) += (u)) +#define L_SUB(v, u) ((v) -= (u)) +#define L_ADDHI(v, a) ((v) += (long long)(a) << 32) +#define L_NEG(v) ((v) = -(v)) +#define L_RSHIFT(v, n) \ + do { \ + if ((v) < 0) \ + (v) = -(-(v) >> (n)); \ + else \ + (v) = (v) >> (n); \ + } while (0) +#define L_MPY(v, a) ((v) *= (a)) +#define L_CLR(v) ((v) = 0) +#define L_ISNEG(v) ((v) < 0) +#define L_LINT(v, a) ((v) = (long long)(a) << 32) +#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) + +/* + * Generic NTP kernel interface + * + * These routines constitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by other routines in this module to adjust the + * phase and frequency of the clock discipline loop which controls the + * system clock. + * + * When the kernel time is reckoned directly in nanoseconds (NTP_NANO + * defined), the time at each tick interrupt is derived directly from + * the kernel time variable. When the kernel time is reckoned in + * microseconds, (NTP_NANO undefined), the time is derived from the + * kernel time variable together with a variable representing the + * leftover nanoseconds at the last tick interrupt. In either case, the + * current nanosecond time is reckoned from these values plus an + * interpolated value derived by the clock routines in another + * architecture-specific module. The interpolation can use either a + * dedicated counter or a processor cycle counter (PCC) implemented in + * some architectures. 
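The l_fp macros above implement 64-bit 32.32 fixed point. A quick userland check of the representation, reusing the macro definitions verbatim with arbitrary values:

/* Userland check of the 32.32 fixed-point macros; the inputs are arbitrary. */
#include <stdio.h>

typedef long long l_fp;
#define L_LINT(v, a)  ((v) = (long long)(a) << 32)
#define L_GINT(v)     ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
#define L_ADD(v, u)   ((v) += (u))
#define L_RSHIFT(v, n) \
    do { \
        if ((v) < 0) \
            (v) = -(-(v) >> (n)); \
        else \
            (v) = (v) >> (n); \
    } while (0)

int main(void) {
    l_fp offset, step;

    L_LINT(offset, -500000);      /* -500000 ns as 32.32 fixed point */
    L_LINT(step, 125);            /* +125 ns */
    L_ADD(offset, step);
    L_RSHIFT(offset, 2);          /* divide by 4, rounding toward zero */
    printf("integer part: %lld ns\n", L_GINT(offset));   /* prints -124968 */
    return 0;
}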
+ * + * Note that all routines must run at priority splclock or higher. + */ +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The nanosecond clock discipline uses two variable types, time + * variables and frequency variables. Both types are represented as 64- + * bit fixed-point quantities with the decimal point between two 32-bit + * halves. On a 32-bit machine, each half is represented as a single + * word and mathematical operations are done using multiple-precision + * arithmetic. On a 64-bit machine, ordinary computer arithmetic is + * used. + * + * A time variable is a signed 64-bit fixed-point number in ns and + * fraction. It represents the remaining time offset to be amortized + * over succeeding tick interrupts. The maximum time offset is about + * 0.5 s and the resolution is about 2.3e-10 ns. + * + * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |s s s| ns | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | fraction | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * A frequency variable is a signed 64-bit fixed-point number in ns/s + * and fraction. It represents the ns and fraction to be added to the + * kernel time variable at each second. The maximum frequency offset is + * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. + * + * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |s s s s s s s s s s s s s| ns/s | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | fraction | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. + */ +#define SHIFT_PLL 4 /* PLL loop gain (shift) */ +#define SHIFT_FLL 2 /* FLL loop gain (shift) */ + +static int time_state = TIME_OK; /* clock state */ +static int time_status = STA_UNSYNC; /* clock status bits */ +static long time_tai; /* TAI offset (s) */ +static long time_monitor; /* last time offset scaled (ns) */ +static long time_constant; /* poll interval (shift) (s) */ +static long time_precision = 1; /* clock precision (ns) */ +static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ +static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ +static long time_reftime; /* time at last adjustment (s) */ +static long time_tick; /* nanoseconds per tick (ns) */ +static l_fp time_offset; /* time offset (ns) */ +static l_fp time_freq; /* frequency offset (ns/s) */ +static l_fp time_adj; /* tick adjust (ns/s) */ + +static int64_t time_adjtime; /* correction from adjtime(2) (usec) */ + +#ifdef PPS_SYNC +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available and connected via a modem control lead. They establish + * the engineering parameters of the clock discipline loop when + * controlled by the PPS signal. 
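SHIFT_PLL and SHIFT_FLL above express loop gain as a right shift. As a toy illustration only (not the kernel's hardupdate()/second_overflow() logic), applying a 1/2^SHIFT_PLL fraction of the remaining offset each second makes the residual phase error decay geometrically:

/* Toy shift-based loop-gain illustration; constants are hypothetical. */
#include <stdio.h>

#define SHIFT_PLL 4    /* gain = 1/16 per second, expressed as a shift */

int main(void) {
    long offset_ns = 500000;       /* 0.5 ms initial phase error */
    for (int sec = 1; sec <= 5; sec++) {
        long adj = offset_ns >> SHIFT_PLL;   /* slew applied this second */
        offset_ns -= adj;
        printf("t=%ds adj=%ldns remaining=%ldns\n", sec, adj, offset_ns);
    }
    return 0;
}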
+ */ +#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ +#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ +#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ +#define PPS_PAVG 4 /* phase avg interval (s) (shift) */ +#define PPS_VALID 120 /* PPS signal watchdog max (s) */ +#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ +#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ + +static struct timespec pps_tf[3]; /* phase median filter */ +static l_fp pps_freq; /* scaled frequency offset (ns/s) */ +static long pps_fcount; /* frequency accumulator */ +static long pps_jitter; /* nominal jitter (ns) */ +static long pps_stabil; /* nominal stability (scaled ns/s) */ +static long pps_lastsec; /* time at last calibration (s) */ +static int pps_valid; /* signal watchdog counter */ +static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ +static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ +static int pps_intcnt; /* wander counter */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ +#endif /* PPS_SYNC */ +/* + * End of phase/frequency-lock loop (PLL/FLL) definitions + */ + +static void ntp_init(void); +static void hardupdate(long offset); + +/* + * ntp_gettime() - NTP user application interface + * + * See the timex.h header file for synopsis and API description. Note + * that the TAI offset is returned in the ntvtimeval.tai structure + * member. + */ +static int +ntp_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct ntptimeval ntv; /* temporary structure */ + struct timespec atv; /* nanosecond time */ + + nanotime(&atv); + ntv.time.tv_sec = atv.tv_sec; + ntv.time.tv_nsec = atv.tv_nsec; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.tai = time_tai; + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions occur, + * an error is returned, instead of the status word. Most + * applications will care only about the fact the system clock + * may not be trusted, not about the details. 
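The same information can be read from user space through the kern.ntp_pll.gettime sysctl registered just below; a minimal userland sketch, assuming the exported opaque object is the struct ntptimeval filled in above and keeping error handling to a bare minimum:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <stdio.h>

int
main(void)
{
        struct ntptimeval ntv;
        size_t len = sizeof(ntv);

        if (sysctlbyname("kern.ntp_pll.gettime", &ntv, &len, NULL, 0) == -1) {
                perror("sysctlbyname");
                return (1);
        }
        printf("state %d maxerror %ld esterror %ld tai %ld\n",
            ntv.time_state, ntv.maxerror, ntv.esterror, ntv.tai);
        return (0);
}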
+ * + * Hardware or software error + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + + /* + * PPS signal lost when either time or frequency synchronization + * requested + */ + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + + /* + * PPS jitter exceeded when time synchronization requested + */ + (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) || + + /* + * PPS wander exceeded or calibration error when frequency + * synchronization requested + */ + (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR))) + ntv.time_state = TIME_ERROR; + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, ""); +SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +#ifdef PPS_SYNC +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, ""); +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, ""); +SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, ""); + +SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", ""); +SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", ""); +#endif +/* + * ntp_adjtime() - NTP daemon application interface + * + * See the timex.h header file for synopsis and API description. Note + * that the timex.constant structure member has a dual purpose to set + * the time constant and to set the TAI offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +/* + * MPSAFE + */ +int +ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap) +{ + struct timex ntv; /* temporary structure */ + long freq; /* frequency ns/s) */ + int modes; /* mode bits from structure */ + int s; /* caller priority */ + int error; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return(error); + + /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + * Note that either the time constant or TAI offset are loaded + * from the ntv.constant member, depending on the mode bits. If + * the STA_PLL bit in the status word is cleared, the state and + * status words are reset to the initial values at boot. + */ + mtx_lock(&Giant); + modes = ntv.modes; + if (modes) + error = suser(td); + if (error) + goto done2; + s = splclock(); + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + if (time_status & STA_PLL && !(ntv.status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; +#ifdef PPS_SYNC + pps_shift = PPS_FAVG; +#endif /* PPS_SYNC */ + } + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) { + if (ntv.constant < 0) + time_constant = 0; + else if (ntv.constant > MAXTC) + time_constant = MAXTC; + else + time_constant = ntv.constant; + } + if (modes & MOD_TAI) { + if (ntv.constant > 0) /* XXX zero & negative numbers ? 
*/ + time_tai = ntv.constant; + } +#ifdef PPS_SYNC + if (modes & MOD_PPSMAX) { + if (ntv.shift < PPS_FAVG) + pps_shiftmax = PPS_FAVG; + else if (ntv.shift > PPS_FAVGMAX) + pps_shiftmax = PPS_FAVGMAX; + else + pps_shiftmax = ntv.shift; + } +#endif /* PPS_SYNC */ + if (modes & MOD_NANO) + time_status |= STA_NANO; + if (modes & MOD_MICRO) + time_status &= ~STA_NANO; + if (modes & MOD_CLKB) + time_status |= STA_CLK; + if (modes & MOD_CLKA) + time_status &= ~STA_CLK; + if (modes & MOD_OFFSET) { + if (time_status & STA_NANO) + hardupdate(ntv.offset); + else + hardupdate(ntv.offset * 1000); + } + if (modes & MOD_FREQUENCY) { + freq = (ntv.freq * 1000LL) >> 16; + if (freq > MAXFREQ) + L_LINT(time_freq, MAXFREQ); + else if (freq < -MAXFREQ) + L_LINT(time_freq, -MAXFREQ); + else + L_LINT(time_freq, freq); +#ifdef PPS_SYNC + pps_freq = time_freq; +#endif /* PPS_SYNC */ + } + + /* + * Retrieve all clock variables. Note that the TAI offset is + * returned only by ntp_gettime(); + */ + if (time_status & STA_NANO) + ntv.offset = L_GINT(time_offset); + else + ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ + ntv.freq = L_GINT((time_freq / 1000LL) << 16); + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + if (time_status & STA_NANO) + ntv.precision = time_precision; + else + ntv.precision = time_precision / 1000; + ntv.tolerance = MAXFREQ * SCALE_PPM; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16); + if (time_status & STA_NANO) + ntv.jitter = pps_jitter; + else + ntv.jitter = pps_jitter / 1000; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (error) + goto done2; + + /* + * Status word error decode. See comments in + * ntp_gettime() routine. + */ + if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || + (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) || + (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) || + (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR))) { + td->td_retval[0] = TIME_ERROR; + } else { + td->td_retval[0] = time_state; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * second_overflow() - called after ntp_tick_adjust() + * + * This routine is ordinarily called immediately following the above + * routine ntp_tick_adjust(). While these two routines are normally + * combined, they are separated here only for the purposes of + * simulation. + */ +void +ntp_update_second(int64_t *adjustment, time_t *newsec) +{ + int tickrate; + l_fp ftemp; /* 32/64-bit temporary */ + + /* + * On rollover of the second both the nanosecond and microsecond + * clocks are updated and the state machine cranked as + * necessary. The phase adjustment to be used for the next + * second is calculated and the maximum error is increased by + * the tolerance. + */ + time_maxerror += MAXFREQ / 1000; + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The nano_time() routine or + * external clock driver will insure that reported time + * is always monotonic. + */ + switch (time_state) { + + /* + * No warning. 
+ */ + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + /* + * Insert second 23:59:60 following second + * 23:59:59. + */ + case TIME_INS: + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if ((*newsec) % 86400 == 0) { + (*newsec)--; + time_state = TIME_OOP; + } + break; + + /* + * Delete second 23:59:59. + */ + case TIME_DEL: + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if (((*newsec) + 1) % 86400 == 0) { + (*newsec)++; + time_tai--; + time_state = TIME_WAIT; + } + break; + + /* + * Insert second in progress. + */ + case TIME_OOP: + time_tai++; + time_state = TIME_WAIT; + break; + + /* + * Wait for status bits to clear. + */ + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the total time adjustment for the next second + * in ns. The offset is reduced by a factor depending on + * whether the PPS signal is operating. Note that the + * value is in effect scaled by the clock frequency, + * since the adjustment is added at each tick interrupt. + */ + ftemp = time_offset; +#ifdef PPS_SYNC + /* XXX even if PPS signal dies we should finish adjustment ? */ + if (time_status & STA_PPSTIME && time_status & + STA_PPSSIGNAL) + L_RSHIFT(ftemp, pps_shift); + else + L_RSHIFT(ftemp, SHIFT_PLL + time_constant); +#else + L_RSHIFT(ftemp, SHIFT_PLL + time_constant); +#endif /* PPS_SYNC */ + time_adj = ftemp; + L_SUB(time_offset, ftemp); + L_ADD(time_adj, time_freq); + + /* + * Apply any correction from adjtime(2). If more than one second + * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM) + * until the last second is slewed the final < 500 usecs. + */ + if (time_adjtime != 0) { + if (time_adjtime > 1000000) + tickrate = 5000; + else if (time_adjtime < -1000000) + tickrate = -5000; + else if (time_adjtime > 500) + tickrate = 500; + else if (time_adjtime < -500) + tickrate = -500; + else if (time_adjtime != 0) + tickrate = time_adjtime; + else + tickrate = 0; /* GCC sucks! */ + time_adjtime -= tickrate; + L_LINT(ftemp, tickrate * 1000); + L_ADD(time_adj, ftemp); + } + *adjustment = time_adj; + +#ifdef PPS_SYNC + if (pps_valid > 0) + pps_valid--; + else + time_status &= ~STA_PPSSIGNAL; +#endif /* PPS_SYNC */ +} + +/* + * ntp_init() - initialize variables and structures + * + * This routine must be called after the kernel variables hz and tick + * are set or changed and before the next tick interrupt. In this + * particular implementation, these values are assumed set elsewhere in + * the kernel. The design allows the clock frequency and tick interval + * to be changed while the system is running. So, this routine should + * probably be integrated with the code that does that. + */ +static void +ntp_init() +{ + + /* + * The following variable must be initialized any time the + * kernel variable hz is changed. + */ + time_tick = NANOSECOND / hz; + + /* + * The following variables are initialized only at startup. Only + * those structures not cleared by the compiler need to be + * initialized, and these only in the simulator. In the actual + * kernel, any nonzero values here will quickly evaporate. 
+ */ + L_CLR(time_offset); + L_CLR(time_freq); +#ifdef PPS_SYNC + pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0; + pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0; + pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0; + pps_fcount = 0; + L_CLR(pps_freq); +#endif /* PPS_SYNC */ +} + +SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, ntp_init, NULL) + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. + * + * For uncompensated quartz crystal oscillators and nominal update + * intervals less than 256 s, operation should be in phase-lock mode, + * where the loop is disciplined to phase. For update intervals greater + * than 1024 s, operation should be in frequency-lock mode, where the + * loop is disciplined to frequency. Between 256 s and 1024 s, the mode + * is selected by the STA_MODE status bit. + */ +static void +hardupdate(offset) + long offset; /* clock offset (ns) */ +{ + long mtemp; + l_fp ftemp; + + /* + * Select how the phase is to be controlled and from which + * source. If the PPS signal is present and enabled to + * discipline the time, the PPS offset is used; otherwise, the + * argument offset is used. + */ + if (!(time_status & STA_PLL)) + return; + if (!(time_status & STA_PPSTIME && time_status & + STA_PPSSIGNAL)) { + if (offset > MAXPHASE) + time_monitor = MAXPHASE; + else if (offset < -MAXPHASE) + time_monitor = -MAXPHASE; + else + time_monitor = offset; + L_LINT(time_offset, time_monitor); + } + + /* + * Select how the frequency is to be controlled and in which + * mode (PLL or FLL). If the PPS signal is present and enabled + * to discipline the frequency, the PPS frequency is used; + * otherwise, the argument offset is used to compute it. + */ + if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { + time_reftime = time_second; + return; + } + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time_second; + mtemp = time_second - time_reftime; + L_LINT(ftemp, time_monitor); + L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); + L_MPY(ftemp, mtemp); + L_ADD(time_freq, ftemp); + time_status &= ~STA_MODE; + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > + MAXSEC)) { + L_LINT(ftemp, (time_monitor << 4) / mtemp); + L_RSHIFT(ftemp, SHIFT_FLL + 4); + L_ADD(time_freq, ftemp); + time_status |= STA_MODE; + } + time_reftime = time_second; + if (L_GINT(time_freq) > MAXFREQ) + L_LINT(time_freq, MAXFREQ); + else if (L_GINT(time_freq) < -MAXFREQ) + L_LINT(time_freq, -MAXFREQ); +} + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. There are two independent + * first-order feedback loops, one for the phase, the other for the + * frequency. 
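For context, hardpps() is intended to be driven by whatever code captures the on-time PPS edge (in FreeBSD that lives in the timecounter/PPS support, not in this file, and only when PPS_SYNC is configured). A minimal, purely illustrative sketch of such a caller, with the real capture details elided and the timestamp fraction standing in for the hardware counter value:

static void
pps_capture_example(void)
{
        struct timespec ts;

        /*
         * Hypothetical interrupt path: take the system time at the
         * PPS assert edge and hand it to hardpps().  The second
         * argument is the captured counter value in nanoseconds; the
         * fractional second of the timestamp is used here only to
         * keep the example short.
         */
        nanotime(&ts);
        hardpps(&ts, ts.tv_nsec);
}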
The phase loop measures and grooms the PPS phase offset + * and leaves it in a handy spot for the seconds overflow routine. The + * frequency loop averages successive PPS phase differences and + * calculates the PPS frequency offset, which is also processed by the + * seconds overflow routine. The code requires the caller to capture the + * time and architecture-dependent hardware counter values in + * nanoseconds at the on-time PPS signal transition. + * + * Note that, on some Unix systems this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for the actual time and frequency variables, which + * are determined by this routine and updated atomically. + */ +void +hardpps(tsp, nsec) + struct timespec *tsp; /* time at PPS */ + long nsec; /* hardware counter at PPS */ +{ + long u_sec, u_nsec, v_nsec; /* temps */ + l_fp ftemp; + + /* + * The signal is first processed by a range gate and frequency + * discriminator. The range gate rejects noise spikes outside + * the range +-500 us. The frequency discriminator rejects input + * signals with apparent frequency outside the range 1 +-500 + * PPM. If two hits occur in the same second, we ignore the + * later hit; if not and a hit occurs outside the range gate, + * keep the later hit for later comparison, but do not process + * it. + */ + time_status |= STA_PPSSIGNAL | STA_PPSJITTER; + time_status &= ~(STA_PPSWANDER | STA_PPSERROR); + pps_valid = PPS_VALID; + u_sec = tsp->tv_sec; + u_nsec = tsp->tv_nsec; + if (u_nsec >= (NANOSECOND >> 1)) { + u_nsec -= NANOSECOND; + u_sec++; + } + v_nsec = u_nsec - pps_tf[0].tv_nsec; + if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - + MAXFREQ) + return; + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0].tv_sec = u_sec; + pps_tf[0].tv_nsec = u_nsec; + + /* + * Compute the difference between the current and previous + * counter values. If the difference exceeds 0.5 s, assume it + * has wrapped around, so correct 1.0 s. If the result exceeds + * the tick interval, the sample point has crossed a tick + * boundary during the last second, so correct the tick. Very + * intricate. + */ + u_nsec = nsec; + if (u_nsec > (NANOSECOND >> 1)) + u_nsec -= NANOSECOND; + else if (u_nsec < -(NANOSECOND >> 1)) + u_nsec += NANOSECOND; + pps_fcount += u_nsec; + if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) + return; + time_status &= ~STA_PPSJITTER; + + /* + * A three-stage median filter is used to help denoise the PPS + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
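As a concrete instance of the filter just described: with samples of 80, 120 and 200 ns in pps_tf[], the median, 120 ns, becomes the offset estimate and the spread, 200 - 80 = 120 ns, becomes the jitter sample. The selection ladder below is equivalent to this small helper (the name is illustrative only):

static long
median3(long a, long b, long c)
{

        if (a > b) {
                if (b > c)
                        return (b);             /* a b c */
                return (a > c ? c : a);         /* a c b or c a b */
        }
        if (a > c)
                return (a);                     /* b a c */
        return (b > c ? c : b);                 /* b c a or c b a */
}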
+ */ + if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { + if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { + v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ + u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; + } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { + v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ + u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; + } else { + v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ + u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; + } + } else { + if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { + v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ + u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; + } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { + v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ + u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; + } else { + v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ + u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; + } + } + + /* + * Nominal jitter is due to PPS signal noise and interrupt + * latency. If it exceeds the popcorn threshold, the sample is + * discarded. otherwise, if so enabled, the time offset is + * updated. We can tolerate a modest loss of data here without + * much degrading time accuracy. + */ + if (u_nsec > (pps_jitter << PPS_POPCORN)) { + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + time_monitor = -v_nsec; + L_LINT(time_offset, time_monitor); + } + pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; + u_sec = pps_tf[0].tv_sec - pps_lastsec; + if (u_sec < (1 << pps_shift)) + return; + + /* + * At the end of the calibration interval the difference between + * the first and last counter values becomes the scaled + * frequency. It will later be divided by the length of the + * interval to determine the frequency update. If the frequency + * exceeds a sanity threshold, or if the actual calibration + * interval is not equal to the expected length, the data are + * discarded. We can tolerate a modest loss of data here without + * much degrading frequency accuracy. + */ + pps_calcnt++; + v_nsec = -pps_fcount; + pps_lastsec = pps_tf[0].tv_sec; + pps_fcount = 0; + u_nsec = MAXFREQ << pps_shift; + if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << + pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + return; + } + + /* + * Here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * for four consecutive averaging intervals, the interval is + * doubled; if it is greater than the threshold for four + * consecutive intervals, the interval is halved. The scaled + * frequency offset is converted to frequency offset. The + * stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring. 
+ */ + L_LINT(ftemp, v_nsec); + L_RSHIFT(ftemp, pps_shift); + L_SUB(ftemp, pps_freq); + u_nsec = L_GINT(ftemp); + if (u_nsec > PPS_MAXWANDER) { + L_LINT(ftemp, PPS_MAXWANDER); + pps_intcnt--; + time_status |= STA_PPSWANDER; + pps_stbcnt++; + } else if (u_nsec < -PPS_MAXWANDER) { + L_LINT(ftemp, -PPS_MAXWANDER); + pps_intcnt--; + time_status |= STA_PPSWANDER; + pps_stbcnt++; + } else { + pps_intcnt++; + } + if (pps_intcnt >= 4) { + pps_intcnt = 4; + if (pps_shift < pps_shiftmax) { + pps_shift++; + pps_intcnt = 0; + } + } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { + pps_intcnt = -4; + if (pps_shift > PPS_FAVG) { + pps_shift--; + pps_intcnt = 0; + } + } + if (u_nsec < 0) + u_nsec = -u_nsec; + pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; + + /* + * The PPS frequency is recalculated and clamped to the maximum + * MAXFREQ. If enabled, the system clock frequency is updated as + * well. + */ + L_ADD(pps_freq, ftemp); + u_nsec = L_GINT(pps_freq); + if (u_nsec > MAXFREQ) + L_LINT(pps_freq, MAXFREQ); + else if (u_nsec < -MAXFREQ) + L_LINT(pps_freq, -MAXFREQ); + if (time_status & STA_PPSFREQ) + time_freq = pps_freq; +} +#endif /* PPS_SYNC */ + +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +adjtime(struct thread *td, struct adjtime_args *uap) +{ + struct timeval atv; + int error; + + if ((error = suser(td))) + return (error); + + mtx_lock(&Giant); + if (uap->olddelta) { + atv.tv_sec = time_adjtime / 1000000; + atv.tv_usec = time_adjtime % 1000000; + if (atv.tv_usec < 0) { + atv.tv_usec += 1000000; + atv.tv_sec--; + } + error = copyout(&atv, uap->olddelta, sizeof(atv)); + if (error) + goto done2; + } + if (uap->delta) { + error = copyin(uap->delta, &atv, sizeof(atv)); + if (error) + goto done2; + time_adjtime = (int64_t)atv.tv_sec * 1000000 + atv.tv_usec; + } +done2: + mtx_unlock(&Giant); + return (error); +} + diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c new file mode 100644 index 0000000..11f3d0c --- /dev/null +++ b/sys/kern/kern_physio.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static void +physwakeup(struct buf *bp) +{ + wakeup((caddr_t) bp); +} + +int +physio(dev_t dev, struct uio *uio, int ioflag) +{ + int i; + int error; + int spl; + caddr_t sa; + u_int iolen; + struct buf *bp; + + /* Keep the process UPAGES from being swapped. XXX: why ? 
*/ + PHOLD(curproc); + + bp = getpbuf(NULL); + sa = bp->b_data; + error = bp->b_error = 0; + + /* XXX: sanity check */ + if(dev->si_iosize_max < PAGE_SIZE) { + printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", + devtoname(dev), dev->si_iosize_max); + dev->si_iosize_max = DFLTPHYS; + } + + for (i = 0; i < uio->uio_iovcnt; i++) { + while (uio->uio_iov[i].iov_len) { + bp->b_flags = B_PHYS; + if (uio->uio_rw == UIO_READ) + bp->b_iocmd = BIO_READ; + else + bp->b_iocmd = BIO_WRITE; + bp->b_dev = dev; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_offset = uio->uio_offset; + bp->b_saveaddr = sa; + + /* Don't exceed drivers iosize limit */ + if (bp->b_bcount > dev->si_iosize_max) + bp->b_bcount = dev->si_iosize_max; + + /* + * Make sure the pbuf can map the request + * XXX: The pbuf has kvasize = MAXPHYS so a request + * XXX: larger than MAXPHYS - PAGE_SIZE must be + * XXX: page aligned or it will be fragmented. + */ + iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK; + if ((bp->b_bcount + iolen) > bp->b_kvasize) { + bp->b_bcount = bp->b_kvasize; + if (iolen != 0) + bp->b_bcount -= PAGE_SIZE; + } + bp->b_bufsize = bp->b_bcount; + + bp->b_blkno = btodb(bp->b_offset); + + if (uio->uio_segflg == UIO_USERSPACE) { + if (!useracc(bp->b_data, bp->b_bufsize, + bp->b_iocmd == BIO_READ ? + VM_PROT_WRITE : VM_PROT_READ)) { + error = EFAULT; + goto doerror; + } + vmapbuf(bp); + } + + DEV_STRATEGY(bp, 0); + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + iolen = bp->b_bcount - bp->b_resid; + if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + if( bp->b_ioflags & BIO_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } +doerror: + relpbuf(bp, NULL); + PRELE(curproc); + return (error); +} diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c new file mode 100644 index 0000000..a197bc0 --- /dev/null +++ b/sys/kern/kern_poll.c @@ -0,0 +1,523 @@ +/*- + * Copyright (c) 2001-2002 Luigi Rizzo + * + * Supported by: the Xorp Project (www.xorp.org) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/socket.h> /* needed by net/if.h */ +#include <sys/sysctl.h> + +#include <net/if.h> /* for IFF_* flags */ +#include <net/netisr.h> /* for NETISR_POLL */ + +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/kthread.h> + +#ifdef SMP +#ifndef COMPILING_LINT +#error DEVICE_POLLING is not compatible with SMP +#endif +#endif + +static void netisr_poll(void); /* the two netisr handlers */ +void netisr_pollmore(void); + +void init_device_poll(void); /* init routine */ +void hardclock_device_poll(void); /* hook from hardclock */ +void ether_poll(int); /* polling while in trap */ + +/* + * Polling support for [network] device drivers. + * + * Drivers which support this feature try to register with the + * polling code. + * + * If registration is successful, the driver must disable interrupts, + * and further I/O is performed through the handler, which is invoked + * (at least once per clock tick) with 3 arguments: the "arg" passed at + * register time (a struct ifnet pointer), a command, and a "count" limit. + * + * The command can be one of the following: + * POLL_ONLY: quick move of "count" packets from input/output queues. + * POLL_AND_CHECK_STATUS: as above, plus check status registers or do + * other more expensive operations. This command is issued periodically + * but less frequently than POLL_ONLY. + * POLL_DEREGISTER: deregister and return to interrupt mode. + * + * The first two commands are only issued if the interface is marked as + * 'IFF_UP and IFF_RUNNING', the last one only if IFF_RUNNING is set. + * + * The count limit specifies how much work the handler can do during the + * call -- typically this is the number of packets to be received, or + * transmitted, etc. (drivers are free to interpret this number, as long + * as the max time spent in the function grows roughly linearly with the + * count). + * + * Deregistration can be requested by the driver itself (typically in the + * *_stop() routine), or by the polling code, by invoking the handler. + * + * Polling can be globally enabled or disabled with the sysctl variable + * kern.polling.enable (default is 0, disabled) + * + * A second variable controls the sharing of CPU between polling/kernel + * network processing, and other activities (typically userlevel tasks): + * kern.polling.user_frac (between 0 and 100, default 50) sets the share + * of CPU allocated to user tasks. CPU is allocated proportionally to the + * shares, by dynamically adjusting the "count" (poll_burst). + * + * Other parameters can should be left to their default values. 
+ * The following constraints hold + * + * 1 <= poll_each_burst <= poll_burst <= poll_burst_max + * 0 <= poll_in_trap <= poll_each_burst + * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX + */ + +#define MIN_POLL_BURST_MAX 10 +#define MAX_POLL_BURST_MAX 1000 + +SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0, + "Device polling parameters"); + +static u_int32_t poll_burst = 5; +SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RW, + &poll_burst, 0, "Current polling burst size"); + +static u_int32_t poll_each_burst = 5; +SYSCTL_UINT(_kern_polling, OID_AUTO, each_burst, CTLFLAG_RW, + &poll_each_burst, 0, "Max size of each burst"); + +static u_int32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */ +SYSCTL_UINT(_kern_polling, OID_AUTO, burst_max, CTLFLAG_RW, + &poll_burst_max, 0, "Max Polling burst size"); + +static u_int32_t poll_in_idle_loop=1; /* do we poll in idle loop ? */ +SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW, + &poll_in_idle_loop, 0, "Enable device polling in idle loop"); + +u_int32_t poll_in_trap; /* used in trap.c */ +SYSCTL_UINT(_kern_polling, OID_AUTO, poll_in_trap, CTLFLAG_RW, + &poll_in_trap, 0, "Poll burst size during a trap"); + +static u_int32_t user_frac = 50; +SYSCTL_UINT(_kern_polling, OID_AUTO, user_frac, CTLFLAG_RW, + &user_frac, 0, "Desired user fraction of cpu time"); + +static u_int32_t reg_frac = 20 ; +SYSCTL_UINT(_kern_polling, OID_AUTO, reg_frac, CTLFLAG_RW, + ®_frac, 0, "Every this many cycles poll register"); + +static u_int32_t short_ticks; +SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RW, + &short_ticks, 0, "Hardclock ticks shorter than they should be"); + +static u_int32_t lost_polls; +SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RW, + &lost_polls, 0, "How many times we would have lost a poll tick"); + +static u_int32_t pending_polls; +SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RW, + &pending_polls, 0, "Do we need to poll again"); + +static int residual_burst = 0; +SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RW, + &residual_burst, 0, "# of residual cycles in burst"); + +static u_int32_t poll_handlers; /* next free entry in pr[]. */ +SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, + &poll_handlers, 0, "Number of registered poll handlers"); + +static int polling = 0; /* global polling enable */ +SYSCTL_UINT(_kern_polling, OID_AUTO, enable, CTLFLAG_RW, + &polling, 0, "Polling enabled"); + +static u_int32_t phase; +SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RW, + &phase, 0, "Polling phase"); + +static u_int32_t suspect; +SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RW, + &suspect, 0, "suspect event"); + +static u_int32_t stalled; +SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RW, + &stalled, 0, "potential stalls"); + +static u_int32_t idlepoll_sleeping; /* idlepoll is sleeping */ +SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD, + &idlepoll_sleeping, 0, "idlepoll is sleeping"); + + +#define POLL_LIST_LEN 128 +struct pollrec { + poll_handler_t *handler; + struct ifnet *ifp; +}; + +static struct pollrec pr[POLL_LIST_LEN]; + +/* + * register relevant netisr. Called from kern_clock.c: + */ +void +init_device_poll(void) +{ + register_netisr(NETISR_POLL, netisr_poll); +} + +/* + * Hook from hardclock. Tries to schedule a netisr, but keeps track + * of lost ticks due to the previous handler taking too long. + * Normally, this should not happen, because polling handler should + * run for a short time. 
However, in some cases (e.g. when there are + * changes in link status etc.) the drivers take a very long time + * (even in the order of milliseconds) to reset and reconfigure the + * device, causing apparent lost polls. + * + * The first part of the code is just for debugging purposes, and tries + * to count how often hardclock ticks are shorter than they should, + * meaning either stray interrupts or delayed events. + */ +void +hardclock_device_poll(void) +{ + static struct timeval prev_t, t; + int delta; + + if (poll_handlers == 0) + return; + + microuptime(&t); + delta = (t.tv_usec - prev_t.tv_usec) + + (t.tv_sec - prev_t.tv_sec)*1000000; + if (delta * hz < 500000) + short_ticks++; + else + prev_t = t; + + if (pending_polls > 100) { + /* + * Too much, assume it has stalled (not always true + * see comment above). + */ + stalled++; + pending_polls = 0; + phase = 0; + } + + if (phase <= 2) { + if (phase != 0) + suspect++; + phase = 1; + schednetisr(NETISR_POLL); + phase = 2; + } + if (pending_polls++ > 0) + lost_polls++; +} + +/* + * ether_poll is called from the idle loop or from the trap handler. + */ +void +ether_poll(int count) +{ + int i; + + mtx_lock(&Giant); + + if (count > poll_each_burst) + count = poll_each_burst; + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].handler && (IFF_UP|IFF_RUNNING) == + (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) + pr[i].handler(pr[i].ifp, 0, count); /* quick check */ + mtx_unlock(&Giant); +} + +/* + * netisr_pollmore is called after other netisr's, possibly scheduling + * another NETISR_POLL call, or adapting the burst size for the next cycle. + * + * It is very bad to fetch large bursts of packets from a single card at once, + * because the burst could take a long time to be completely processed, or + * could saturate the intermediate queue (ipintrq or similar) leading to + * losses or unfairness. To reduce the problem, and also to account better for + * time spent in network-related processing, we split the burst in smaller + * chunks of fixed size, giving control to the other netisr's between chunks. + * This helps in improving the fairness, reducing livelock (because we + * emulate more closely the "process to completion" that we have with + * fastforwarding) and accounting for the work performed in low level + * handling and forwarding. + */ + +static struct timeval poll_start_t; + +void +netisr_pollmore() +{ + struct timeval t; + int kern_load; + /* XXX run at splhigh() or equivalent */ + + phase = 5; + if (residual_burst > 0) { + schednetisr(NETISR_POLL); + /* will run immediately on return, followed by netisrs */ + return ; + } + /* here we can account time spent in netisr's in this tick */ + microuptime(&t); + kern_load = (t.tv_usec - poll_start_t.tv_usec) + + (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ + kern_load = (kern_load * hz) / 10000; /* 0..100 */ + if (kern_load > (100 - user_frac)) { /* try decrease ticks */ + if (poll_burst > 1) + poll_burst--; + } else { + if (poll_burst < poll_burst_max) + poll_burst++; + } + + pending_polls--; + if (pending_polls == 0) /* we are done */ + phase = 0; + else { + /* + * Last cycle was long and caused us to miss one or more + * hardclock ticks. Restart processing again, but slightly + * reduce the burst size to prevent that this happens again. + */ + poll_burst -= (poll_burst / 8); + if (poll_burst < 1) + poll_burst = 1; + schednetisr(NETISR_POLL); + phase = 6; + } +} + +/* + * netisr_poll is scheduled by schednetisr when appropriate, typically once + * per tick. 
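To make the CPU-share arithmetic in netisr_pollmore() concrete: a tick is 1000000/hz microseconds, so the time spent in the netisrs is turned into a 0..100 percentage of a tick by multiplying by hz and dividing by 10000. A small worked example, with the values chosen only for the illustration:

static int
kern_load_example(void)
{
        int hz_assumed = 1000;          /* assumed tick rate */
        int us_spent = 300;             /* microseconds spent in netisrs */
        int kern_load;

        /* 300 us out of a 1000 us tick is 30 percent */
        kern_load = (us_spent * hz_assumed) / 10000;
        /*
         * With the default user_frac of 50, any value above
         * 100 - user_frac = 50 would shrink poll_burst.
         */
        return (kern_load);
}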
It is called at splnet() so first thing to do is to upgrade to + * splimp(), and call all registered handlers. + */ +static void +netisr_poll(void) +{ + static int reg_frac_count; + int i, cycles; + enum poll_cmd arg = POLL_ONLY; + mtx_lock(&Giant); + + phase = 3; + if (residual_burst == 0) { /* first call in this tick */ + microuptime(&poll_start_t); + /* + * Check that paremeters are consistent with runtime + * variables. Some of these tests could be done at sysctl + * time, but the savings would be very limited because we + * still have to check against reg_frac_count and + * poll_each_burst. So, instead of writing separate sysctl + * handlers, we do all here. + */ + + if (reg_frac > hz) + reg_frac = hz; + else if (reg_frac < 1) + reg_frac = 1; + if (reg_frac_count > reg_frac) + reg_frac_count = reg_frac - 1; + if (reg_frac_count-- == 0) { + arg = POLL_AND_CHECK_STATUS; + reg_frac_count = reg_frac - 1; + } + if (poll_burst_max < MIN_POLL_BURST_MAX) + poll_burst_max = MIN_POLL_BURST_MAX; + else if (poll_burst_max > MAX_POLL_BURST_MAX) + poll_burst_max = MAX_POLL_BURST_MAX; + + if (poll_each_burst < 1) + poll_each_burst = 1; + else if (poll_each_burst > poll_burst_max) + poll_each_burst = poll_burst_max; + + residual_burst = poll_burst; + } + cycles = (residual_burst < poll_each_burst) ? + residual_burst : poll_each_burst; + residual_burst -= cycles; + + if (polling) { + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].handler && (IFF_UP|IFF_RUNNING) == + (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) + pr[i].handler(pr[i].ifp, arg, cycles); + } else { /* unregister */ + for (i = 0 ; i < poll_handlers ; i++) { + if (pr[i].handler && + pr[i].ifp->if_flags & IFF_RUNNING) { + pr[i].ifp->if_ipending &= ~IFF_POLLING; + pr[i].handler(pr[i].ifp, POLL_DEREGISTER, 1); + } + pr[i].handler=NULL; + } + residual_burst = 0; + poll_handlers = 0; + } + /* on -stable, schednetisr(NETISR_POLLMORE); */ + phase = 4; + mtx_unlock(&Giant); +} + +/* + * Try to register routine for polling. Returns 1 if successful + * (and polling should be enabled), 0 otherwise. + * A device is not supposed to register itself multiple times. + * + * This is called from within the *_intr() functions, so we do not need + * further locking. + */ +int +ether_poll_register(poll_handler_t *h, struct ifnet *ifp) +{ + int s; + + if (polling == 0) /* polling disabled, cannot register */ + return 0; + if (h == NULL || ifp == NULL) /* bad arguments */ + return 0; + if ( !(ifp->if_flags & IFF_UP) ) /* must be up */ + return 0; + if (ifp->if_ipending & IFF_POLLING) /* already polling */ + return 0; + + s = splhigh(); + if (poll_handlers >= POLL_LIST_LEN) { + /* + * List full, cannot register more entries. + * This should never happen; if it does, it is probably a + * broken driver trying to register multiple times. Checking + * this at runtime is expensive, and won't solve the problem + * anyways, so just report a few times and then give up. + */ + static int verbose = 10 ; + splx(s); + if (verbose >0) { + printf("poll handlers list full, " + "maybe a broken driver ?\n"); + verbose--; + } + return 0; /* no polling for you */ + } + + pr[poll_handlers].handler = h; + pr[poll_handlers].ifp = ifp; + poll_handlers++; + ifp->if_ipending |= IFF_POLLING; + splx(s); + if (idlepoll_sleeping) + wakeup(&idlepoll_sleeping); + return 1; /* polling enabled in next call */ +} + +/* + * Remove interface from the polling list. Normally called by *_stop(). 
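A driver-side sketch of the protocol described earlier: the handler and interrupt routine below are hypothetical (names invented for the example), but they follow the registration rules stated above, with ether_poll_register() called from the *_intr() path and the three commands honoured; a real driver would move packets and touch hardware where the comments are.

static void
xx_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
{

        if (cmd == POLL_DEREGISTER) {
                /* re-enable the hardware interrupt and stop polling */
                return;
        }
        /* move at most "count" packets to/from the rings here */
        if (cmd == POLL_AND_CHECK_STATUS) {
                /* the less frequent, more expensive status check */
        }
}

static void
xx_intr(void *arg)
{
        struct ifnet *ifp = arg;

        if (ether_poll_register(xx_poll, ifp)) {
                /* disable hardware interrupts; xx_poll() takes over */
                return;
        }
        /* otherwise fall through to normal interrupt processing */
}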
+ * It is not an error to call it with IFF_POLLING clear, the call is + * sufficiently rare to be preferable to save the space for the extra + * test in each driver in exchange of one additional function call. + */ +int +ether_poll_deregister(struct ifnet *ifp) +{ + int i; + + mtx_lock(&Giant); + if ( !ifp || !(ifp->if_ipending & IFF_POLLING) ) { + mtx_unlock(&Giant); + return 0; + } + for (i = 0 ; i < poll_handlers ; i++) + if (pr[i].ifp == ifp) /* found it */ + break; + ifp->if_ipending &= ~IFF_POLLING; /* found or not... */ + if (i == poll_handlers) { + mtx_unlock(&Giant); + printf("ether_poll_deregister: ifp not found!!!\n"); + return 0; + } + poll_handlers--; + if (i < poll_handlers) { /* Last entry replaces this one. */ + pr[i].handler = pr[poll_handlers].handler; + pr[i].ifp = pr[poll_handlers].ifp; + } + mtx_unlock(&Giant); + return 1; +} + +static void +poll_idle(void) +{ + struct thread *td = curthread; + struct rtprio rtp; + int pri; + + rtp.prio = RTP_PRIO_MAX; /* lowest priority */ + rtp.type = RTP_PRIO_IDLE; + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, td->td_ksegrp); + pri = td->td_priority; + mtx_unlock_spin(&sched_lock); + + for (;;) { + if (poll_in_idle_loop && poll_handlers > 0) { + idlepoll_sleeping = 0; + mtx_lock(&Giant); + ether_poll(poll_each_burst); + mtx_unlock(&Giant); + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + setrunqueue(td); + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } else { + idlepoll_sleeping = 1; + tsleep(&idlepoll_sleeping, pri, "pollid", hz * 3); + } + } +} + +static struct proc *idlepoll; +static struct kproc_desc idlepoll_kp = { + "idlepoll", + poll_idle, + &idlepoll +}; +SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp) diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 0000000..a5378d9 --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,1072 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/sysctl.h> +#include <sys/filedesc.h> +#include <sys/tty.h> +#include <sys/signalvar.h> +#include <sys/sx.h> +#include <sys/user.h> +#include <sys/jail.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <machine/critical.h> + +MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); +MALLOC_DEFINE(M_SESSION, "session", "session header"); +static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); +MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); + +static struct proc *dopfind(register pid_t); + +static void doenterpgrp(struct proc *, struct pgrp *); + +static void pgdelete(struct pgrp *); + +static void orphanpg(struct pgrp *pg); + +/* + * Other process lists + */ +struct pidhashhead *pidhashtbl; +u_long pidhash; +struct pgrphashhead *pgrphashtbl; +u_long pgrphash; +struct proclist allproc; +struct proclist zombproc; +struct sx allproc_lock; +struct sx proctree_lock; +struct mtx pargs_ref_lock; +uma_zone_t proc_zone; +uma_zone_t ithread_zone; + +CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); + +/* + * Initialize global process hashing structures. + */ +void +procinit() +{ + + sx_init(&allproc_lock, "allproc"); + sx_init(&proctree_lock, "proctree"); + mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF); + LIST_INIT(&allproc); + LIST_INIT(&zombproc); + pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); + pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); + proc_zone = uma_zcreate("PROC", sizeof (struct proc), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uihashinit(); +} + +/* + * Note that we do not link to the proc's ucred here + * The thread is linked as if running but no KSE assigned + */ +static void +thread_link(struct thread *td, struct ksegrp *kg) +{ + struct proc *p = kg->kg_proc; + + td->td_proc = p; + td->td_ksegrp = kg; + td->td_last_kse = &p->p_kse; + + TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); + TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); + td->td_critnest = 0; + td->td_kse = NULL; + cpu_thread_link(td); +} + +/* + * KSE is linked onto the idle queue. 
+ */ +static void +kse_link(struct kse *ke, struct ksegrp *kg) +{ + struct proc *p = kg->kg_proc; + + TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); + kg->kg_kses++; + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_proc = p; + ke->ke_ksegrp = kg; + ke->ke_thread = NULL; + ke->ke_oncpu = NOCPU; +} + +static void +ksegrp_link(struct ksegrp *kg, struct proc *p) +{ + + TAILQ_INIT(&kg->kg_threads); + TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ + TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ + TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ + TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */ + kg->kg_proc = p; +/* the following counters are in the -zero- section and may not need clearing */ + kg->kg_runnable = 0; + kg->kg_kses = 0; + kg->kg_runq_kses = 0; /* XXXKSE change name */ +/* link it in now that it's consitant */ + TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); +} + +/* + * for a newly created process, + * link up a the structure and its initial threads etc. + */ +void +proc_linkup(struct proc *p, struct ksegrp *kg, + struct kse *ke, struct thread *td) +{ + + TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ + TAILQ_INIT(&p->p_threads); /* all threads in proc */ + + ksegrp_link(kg, p); + kse_link(ke, kg); + thread_link(td, kg); + /* link them together for 1:1 */ + td->td_kse = ke; + ke->ke_thread = td; +} + +/* temporary version is ultra simple while we are in 1:1 mode */ +struct thread * +thread_get(struct proc *p) +{ + struct thread *td = &p->p_xxthread; + + return (td); +} + + +/********************* +* STUB KSE syscalls +*********************/ + +/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */ +int +thread_wakeup(struct thread *td, struct thread_wakeup_args *uap) +{ + + return(ENOSYS); +} + +int +kse_exit(struct thread *td, struct kse_exit_args *uap) +{ + + return(ENOSYS); +} + +int +kse_yield(struct thread *td, struct kse_yield_args *uap) +{ + + return(ENOSYS); +} + +int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) +{ + + return(ENOSYS); +} + + +int +kse_new(struct thread *td, struct kse_new_args *uap) +/* struct kse_new_args { + struct kse_mailbox *mbx; + int new_grp_flag; +}; */ +{ + + return (ENOSYS); +} + +/* + * Is p an inferior of the current process? + */ +int +inferior(p) + register struct proc *p; +{ + + sx_assert(&proctree_lock, SX_LOCKED); + for (; p != curproc; p = p->p_pptr) + if (p->p_pid == 0) + return (0); + return (1); +} + +/* + * Locate a process by number + */ +struct proc * +pfind(pid) + register pid_t pid; +{ + register struct proc *p; + + sx_slock(&allproc_lock); + p = dopfind(pid); + sx_sunlock(&allproc_lock); + return (p); +} + +static struct proc * +dopfind(pid) + register pid_t pid; +{ + register struct proc *p; + + sx_assert(&allproc_lock, SX_LOCKED); + + LIST_FOREACH(p, PIDHASH(pid), p_hash) + if (p->p_pid == pid) { + PROC_LOCK(p); + break; + } + return (p); +} + +/* + * Locate a process group by number. + * The caller must hold proctree_lock. + */ +struct pgrp * +pgfind(pgid) + register pid_t pgid; +{ + register struct pgrp *pgrp; + + sx_assert(&proctree_lock, SX_LOCKED); + + LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { + if (pgrp->pg_id == pgid) { + PGRP_LOCK(pgrp); + return (pgrp); + } + } + return (NULL); +} + +/* + * Create a new process group. + * pgid must be equal to the pid of p. + * Begin a new session if required. 
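This function is called from setsid() and setpgid(); a compressed sketch of the new-session calling pattern, with the leader checks and error unwinding that the real callers perform omitted for brevity:

static int
new_session_example(struct proc *p)
{
        struct pgrp *newpgrp;
        struct session *newsess;
        int error;

        /* Both structures are allocated before proctree_lock is taken. */
        MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
            M_WAITOK | M_ZERO);
        MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
            M_WAITOK | M_ZERO);

        sx_xlock(&proctree_lock);
        /* The real callers verify p is not already a group/session leader. */
        error = enterpgrp(p, p->p_pid, newpgrp, newsess);
        sx_xunlock(&proctree_lock);
        return (error);
}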
+ */ +int +enterpgrp(p, pgid, pgrp, sess) + register struct proc *p; + pid_t pgid; + struct pgrp *pgrp; + struct session *sess; +{ + struct pgrp *pgrp2; + + sx_assert(&proctree_lock, SX_XLOCKED); + + KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); + KASSERT(p->p_pid == pgid, + ("enterpgrp: new pgrp and pid != pgid")); + + pgrp2 = pgfind(pgid); + + KASSERT(pgrp2 == NULL, + ("enterpgrp: pgrp with pgid exists")); + KASSERT(!SESS_LEADER(p), + ("enterpgrp: session leader attempted setpgrp")); + + mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); + + if (sess != NULL) { + /* + * new session + */ + mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); + PROC_LOCK(p); + p->p_flag &= ~P_CONTROLT; + PROC_UNLOCK(p); + PGRP_LOCK(pgrp); + sess->s_leader = p; + sess->s_sid = p->p_pid; + sess->s_count = 1; + sess->s_ttyvp = NULL; + sess->s_ttyp = NULL; + bcopy(p->p_session->s_login, sess->s_login, + sizeof(sess->s_login)); + pgrp->pg_session = sess; + KASSERT(p == curproc, + ("enterpgrp: mksession and p != curproc")); + } else { + pgrp->pg_session = p->p_session; + SESS_LOCK(pgrp->pg_session); + pgrp->pg_session->s_count++; + SESS_UNLOCK(pgrp->pg_session); + PGRP_LOCK(pgrp); + } + pgrp->pg_id = pgid; + LIST_INIT(&pgrp->pg_members); + + /* + * As we have an exclusive lock of proctree_lock, + * this should not deadlock. + */ + LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); + pgrp->pg_jobc = 0; + SLIST_INIT(&pgrp->pg_sigiolst); + PGRP_UNLOCK(pgrp); + + doenterpgrp(p, pgrp); + + return (0); +} + +/* + * Move p to an existing process group + */ +int +enterthispgrp(p, pgrp) + register struct proc *p; + struct pgrp *pgrp; +{ + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); + KASSERT(pgrp->pg_session == p->p_session, + ("%s: pgrp's session %p, p->p_session %p.\n", + __func__, + pgrp->pg_session, + p->p_session)); + KASSERT(pgrp != p->p_pgrp, + ("%s: p belongs to pgrp.", __func__)); + + doenterpgrp(p, pgrp); + + return (0); +} + +/* + * Move p to a process group + */ +static void +doenterpgrp(p, pgrp) + struct proc *p; + struct pgrp *pgrp; +{ + struct pgrp *savepgrp; + + sx_assert(&proctree_lock, SX_XLOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); + + savepgrp = p->p_pgrp; + + /* + * Adjust eligibility of affected pgrps to participate in job control. + * Increment eligibility counts before decrementing, otherwise we + * could reach 0 spuriously during the first call. 
+ */ + fixjobc(p, pgrp, 1); + fixjobc(p, p->p_pgrp, 0); + + PGRP_LOCK(pgrp); + PGRP_LOCK(savepgrp); + PROC_LOCK(p); + LIST_REMOVE(p, p_pglist); + p->p_pgrp = pgrp; + PROC_UNLOCK(p); + LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); + PGRP_UNLOCK(savepgrp); + PGRP_UNLOCK(pgrp); + if (LIST_EMPTY(&savepgrp->pg_members)) + pgdelete(savepgrp); +} + +/* + * remove process from process group + */ +int +leavepgrp(p) + register struct proc *p; +{ + struct pgrp *savepgrp; + + sx_assert(&proctree_lock, SX_XLOCKED); + savepgrp = p->p_pgrp; + PGRP_LOCK(savepgrp); + PROC_LOCK(p); + LIST_REMOVE(p, p_pglist); + p->p_pgrp = NULL; + PROC_UNLOCK(p); + PGRP_UNLOCK(savepgrp); + if (LIST_EMPTY(&savepgrp->pg_members)) + pgdelete(savepgrp); + return (0); +} + +/* + * delete a process group + */ +static void +pgdelete(pgrp) + register struct pgrp *pgrp; +{ + struct session *savesess; + + sx_assert(&proctree_lock, SX_XLOCKED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pgid. + */ + funsetownlst(&pgrp->pg_sigiolst); + + PGRP_LOCK(pgrp); + if (pgrp->pg_session->s_ttyp != NULL && + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp = NULL; + LIST_REMOVE(pgrp, pg_hash); + savesess = pgrp->pg_session; + SESS_LOCK(savesess); + savesess->s_count--; + SESS_UNLOCK(savesess); + PGRP_UNLOCK(pgrp); + if (savesess->s_count == 0) { + mtx_destroy(&savesess->s_mtx); + FREE(pgrp->pg_session, M_SESSION); + } + mtx_destroy(&pgrp->pg_mtx); + FREE(pgrp, M_PGRP); +} + +/* + * Adjust pgrp jobc counters when specified process changes process group. + * We count the number of processes in each process group that "qualify" + * the group for terminal job control (those with a parent in a different + * process group of the same session). If that count reaches zero, the + * process group becomes orphaned. Check both the specified process' + * process group and that of its children. + * entering == 0 => p is leaving specified group. + * entering == 1 => p is entering specified group. + */ +void +fixjobc(p, pgrp, entering) + register struct proc *p; + register struct pgrp *pgrp; + int entering; +{ + register struct pgrp *hispgrp; + register struct session *mysession; + + sx_assert(&proctree_lock, SX_LOCKED); + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); + SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); + + /* + * Check p's parent to see whether p qualifies its own process + * group; if so, adjust count for p's process group. + */ + mysession = pgrp->pg_session; + if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && + hispgrp->pg_session == mysession) { + PGRP_LOCK(pgrp); + if (entering) + pgrp->pg_jobc++; + else { + --pgrp->pg_jobc; + if (pgrp->pg_jobc == 0) + orphanpg(pgrp); + } + PGRP_UNLOCK(pgrp); + } + + /* + * Check this process' children to see whether they qualify + * their process groups; if so, adjust counts for children's + * process groups. + */ + LIST_FOREACH(p, &p->p_children, p_sibling) { + if ((hispgrp = p->p_pgrp) != pgrp && + hispgrp->pg_session == mysession && + p->p_stat != SZOMB) { + PGRP_LOCK(hispgrp); + if (entering) + hispgrp->pg_jobc++; + else { + --hispgrp->pg_jobc; + if (hispgrp->pg_jobc == 0) + orphanpg(hispgrp); + } + PGRP_UNLOCK(hispgrp); + } + } +} + +/* + * A process group has become orphaned; + * if there are any stopped processes in the group, + * hang-up all process in that group. 
+ */ +static void +orphanpg(pg) + struct pgrp *pg; +{ + register struct proc *p; + + PGRP_LOCK_ASSERT(pg, MA_OWNED); + + mtx_lock_spin(&sched_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + if (p->p_stat == SSTOP) { + mtx_unlock_spin(&sched_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + psignal(p, SIGHUP); + psignal(p, SIGCONT); + PROC_UNLOCK(p); + } + return; + } + } + mtx_unlock_spin(&sched_lock); +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) +{ + register struct pgrp *pgrp; + register struct proc *p; + register int i; + + for (i = 0; i <= pgrphash; i++) { + if (!LIST_EMPTY(&pgrphashtbl[i])) { + printf("\tindx %d\n", i); + LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { + printf( + "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", + (void *)pgrp, (long)pgrp->pg_id, + (void *)pgrp->pg_session, + pgrp->pg_session->s_count, + (void *)LIST_FIRST(&pgrp->pg_members)); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + printf("\t\tpid %ld addr %p pgrp %p\n", + (long)p->p_pid, (void *)p, + (void *)p->p_pgrp); + } + } + } + } +} +#endif /* DDB */ + +/* + * Fill in an kinfo_proc structure for the specified process. + * Must be called with the target process locked. + */ +void +fill_kinfo_proc(p, kp) + struct proc *p; + struct kinfo_proc *kp; +{ + struct thread *td; + struct tty *tp; + struct session *sp; + struct timeval tv; + + bzero(kp, sizeof(*kp)); + + kp->ki_structsize = sizeof(*kp); + kp->ki_paddr = p; + PROC_LOCK_ASSERT(p, MA_OWNED); + kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */ + kp->ki_args = p->p_args; + kp->ki_textvp = p->p_textvp; +#ifdef KTRACE + kp->ki_tracep = p->p_tracep; + mtx_lock(&ktrace_mtx); + kp->ki_traceflag = p->p_traceflag; + mtx_unlock(&ktrace_mtx); +#endif + kp->ki_fd = p->p_fd; + kp->ki_vmspace = p->p_vmspace; + if (p->p_ucred) { + kp->ki_uid = p->p_ucred->cr_uid; + kp->ki_ruid = p->p_ucred->cr_ruid; + kp->ki_svuid = p->p_ucred->cr_svuid; + /* XXX bde doesn't like KI_NGROUPS */ + kp->ki_ngroups = min(p->p_ucred->cr_ngroups, KI_NGROUPS); + bcopy(p->p_ucred->cr_groups, kp->ki_groups, + kp->ki_ngroups * sizeof(gid_t)); + kp->ki_rgid = p->p_ucred->cr_rgid; + kp->ki_svgid = p->p_ucred->cr_svgid; + } + if (p->p_procsig) { + kp->ki_sigignore = p->p_procsig->ps_sigignore; + kp->ki_sigcatch = p->p_procsig->ps_sigcatch; + } + mtx_lock_spin(&sched_lock); + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + struct vmspace *vm = p->p_vmspace; + + kp->ki_size = vm->vm_map.size; + kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ + if (p->p_sflag & PS_INMEM) + kp->ki_rssize += UAREA_PAGES; + FOREACH_THREAD_IN_PROC(p, td) /* XXXKSE: thread swapout check */ + kp->ki_rssize += KSTACK_PAGES; + kp->ki_swrss = vm->vm_swrss; + kp->ki_tsize = vm->vm_tsize; + kp->ki_dsize = vm->vm_dsize; + kp->ki_ssize = vm->vm_ssize; + } + if ((p->p_sflag & PS_INMEM) && p->p_stats) { + kp->ki_start = p->p_stats->p_start; + kp->ki_rusage = p->p_stats->p_ru; + kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec + + p->p_stats->p_cru.ru_stime.tv_sec; + kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec + + p->p_stats->p_cru.ru_stime.tv_usec; + } + td = FIRST_THREAD_IN_PROC(p); + if (td->td_wmesg != NULL) + strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1); + if (p->p_stat == SMTX) { + kp->ki_kiflag |= KI_MTXBLOCK; + strncpy(kp->ki_mtxname, td->td_mtxname, + sizeof(kp->ki_mtxname) - 1); + } + kp->ki_stat = p->p_stat; + kp->ki_sflag = p->p_sflag; + kp->ki_swtime = 
p->p_swtime; + kp->ki_pid = p->p_pid; + /* vvv XXXKSE */ + bintime2timeval(&p->p_runtime, &tv); + kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; + kp->ki_pctcpu = p->p_kse.ke_pctcpu; + kp->ki_estcpu = td->td_ksegrp->kg_estcpu; + kp->ki_slptime = td->td_ksegrp->kg_slptime; + kp->ki_wchan = td->td_wchan; + kp->ki_pri.pri_level = td->td_priority; + kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri; + kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class; + kp->ki_pri.pri_native = td->td_base_pri; + kp->ki_nice = td->td_ksegrp->kg_nice; + kp->ki_rqindex = p->p_kse.ke_rqindex; + kp->ki_oncpu = p->p_kse.ke_oncpu; + kp->ki_lastcpu = td->td_lastcpu; + kp->ki_tdflags = td->td_flags; + kp->ki_pcb = td->td_pcb; + kp->ki_kstack = (void *)td->td_kstack; + /* ^^^ XXXKSE */ + mtx_unlock_spin(&sched_lock); + sp = NULL; + tp = NULL; + if (p->p_pgrp) { + kp->ki_pgid = p->p_pgrp->pg_id; + kp->ki_jobc = p->p_pgrp->pg_jobc; + sp = p->p_pgrp->pg_session; + + if (sp != NULL) { + kp->ki_sid = sp->s_sid; + SESS_LOCK(sp); + strncpy(kp->ki_login, sp->s_login, + sizeof(kp->ki_login) - 1); + if (sp->s_ttyvp) + kp->ki_kiflag |= KI_CTTY; + if (SESS_LEADER(p)) + kp->ki_kiflag |= KI_SLEADER; + tp = sp->s_ttyp; + SESS_UNLOCK(sp); + } + } + if ((p->p_flag & P_CONTROLT) && tp != NULL) { + kp->ki_tdev = dev2udev(tp->t_dev); + kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + if (tp->t_session) + kp->ki_tsid = tp->t_session->s_sid; + } else + kp->ki_tdev = NOUDEV; + if (p->p_comm[0] != '\0') { + strncpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm) - 1); + strncpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm) - 1); + } + kp->ki_siglist = p->p_siglist; + kp->ki_sigmask = p->p_sigmask; + kp->ki_xstat = p->p_xstat; + kp->ki_acflag = p->p_acflag; + kp->ki_flag = p->p_flag; + /* If jailed(p->p_ucred), emulate the old P_JAILED flag. */ + if (jailed(p->p_ucred)) + kp->ki_flag |= P_JAILED; + kp->ki_lock = p->p_lock; + if (p->p_pptr) + kp->ki_ppid = p->p_pptr->p_pid; +} + +/* + * Locate a zombie process by number + */ +struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + sx_slock(&allproc_lock); + LIST_FOREACH(p, &zombproc, p_list) + if (p->p_pid == pid) { + PROC_LOCK(p); + break; + } + sx_sunlock(&allproc_lock); + return (p); +} + + +/* + * Must be called with the process locked and will return with it unlocked. 
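+ * (The proc lock is dropped across the SYSCTL_OUT() copyout, so the
+ * pid is looked up again afterwards; EAGAIN is returned if the
+ * process has exited, become a zombie, or had its pid reused in the
+ * meantime.)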
+ */ +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct kinfo_proc kinfo_proc; + int error; + struct proc *np; + pid_t pid = p->p_pid; + + PROC_LOCK_ASSERT(p, MA_OWNED); + fill_kinfo_proc(p, &kinfo_proc); + PROC_UNLOCK(p); + error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); + if (error) + return (error); + if (doingzomb) + np = zpfind(pid); + else { + if (pid == 0) + return (0); + np = pfind(pid); + } + if (np == NULL) + return EAGAIN; + if (np != p) { + PROC_UNLOCK(np); + return EAGAIN; + } + PROC_UNLOCK(np); + return (0); +} + +static int +sysctl_kern_proc(SYSCTL_HANDLER_ARGS) +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + if (p_cansee(curthread, p)) { + PROC_UNLOCK(p); + return (0); + } + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + sx_slock(&allproc_lock); + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = LIST_FIRST(&allproc); + else + p = LIST_FIRST(&zombproc); + for (; p != 0; p = LIST_NEXT(p, p_list)) { + PROC_LOCK(p); + /* + * Show a user only appropriate processes. + */ + if (p_cansee(curthread, p)) { + PROC_UNLOCK(p); + continue; + } + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) { + PROC_UNLOCK(p); + continue; + } + /* + * TODO - make more efficient (see notes below). + * do by session. 
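+ * (Every variant of this sysctl currently walks all of allproc and
+ * zombproc and filters one process at a time; KERN_PROC_PGRP could
+ * instead walk the group's pg_members list, and the tty case could
+ * be driven from the session.)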
+ */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL) { + PROC_UNLOCK(p); + continue; + } + SESS_LOCK(p->p_session); + if (p->p_session->s_ttyp == NULL || + dev2udev(p->p_session->s_ttyp->t_dev) != + (udev_t)name[0]) { + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + continue; + } + SESS_UNLOCK(p->p_session); + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_ucred->cr_ruid != (uid_t)name[0]) { + PROC_UNLOCK(p); + continue; + } + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) { + sx_sunlock(&allproc_lock); + return (error); + } + } + } + sx_sunlock(&allproc_lock); + return (0); +} + +struct pargs * +pargs_alloc(int len) +{ + struct pargs *pa; + + MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS, + M_WAITOK); + pa->ar_ref = 1; + pa->ar_length = len; + return (pa); +} + +void +pargs_free(struct pargs *pa) +{ + + FREE(pa, M_PARGS); +} + +void +pargs_hold(struct pargs *pa) +{ + + if (pa == NULL) + return; + PARGS_LOCK(pa); + pa->ar_ref++; + PARGS_UNLOCK(pa); +} + +void +pargs_drop(struct pargs *pa) +{ + + if (pa == NULL) + return; + PARGS_LOCK(pa); + if (--pa->ar_ref == 0) { + PARGS_UNLOCK(pa); + pargs_free(pa); + } else + PARGS_UNLOCK(pa); +} + +/* + * This sysctl allows a process to retrieve the argument list or process + * title for another process without groping around in the address space + * of the other process. It also allow a process to set its own "process + * title to a string of its own choice. 
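+ * (The MIB is { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid }.  A
+ * minimal userland sketch, for illustration only and assuming
+ * <sys/sysctl.h> and a pid_t pid:
+ *
+ *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid };
+ *	char buf[4096];
+ *	size_t len = sizeof(buf);
+ *
+ *	if (sysctl(mib, 4, buf, &len, NULL, 0) == 0)
+ *		... buf now holds len bytes of NUL-separated argv strings ...
+ *
+ * Reading is subject to the ps_argsopen/p_cansee policy below;
+ * writing through the same MIB is accepted only from the process
+ * itself, which is how a process publishes a new title.)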
+ */ +static int +sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + struct pargs *pa; + int error = 0; + + if (namelen != 1) + return (EINVAL); + + p = pfind((pid_t)name[0]); + if (!p) + return (0); + + if ((!ps_argsopen) && p_cansee(curthread, p)) { + PROC_UNLOCK(p); + return (0); + } + PROC_UNLOCK(p); + + if (req->newptr && curproc != p) + return (EPERM); + + PROC_LOCK(p); + pa = p->p_args; + pargs_hold(pa); + PROC_UNLOCK(p); + if (req->oldptr && pa != NULL) { + error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); + } + pargs_drop(pa); + if (req->newptr == NULL) + return (error); + + PROC_LOCK(p); + pa = p->p_args; + p->p_args = NULL; + PROC_UNLOCK(p); + pargs_drop(pa); + + if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) + return (error); + + pa = pargs_alloc(req->newlen); + error = SYSCTL_IN(req, pa->ar_args, req->newlen); + if (!error) { + PROC_LOCK(p); + p->p_args = pa; + PROC_UNLOCK(p); + } else + pargs_free(pa); + return (error); +} + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, + sysctl_kern_proc_args, "Process argument list"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c new file mode 100644 index 0000000..a3e4bea --- /dev/null +++ b/sys/kern/kern_prot.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * Copyright (c) 2000-2001 Robert N. M. Watson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +/* + * System calls related to processes and protection + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/jail.h> +#include <sys/pioctl.h> +#include <sys/resourcevar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +static MALLOC_DEFINE(M_CRED, "cred", "credentials"); + +SYSCTL_DECL(_security); +SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, + "BSD security policy"); + +#ifndef _SYS_SYSPROTO_H_ +struct getpid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getpid(struct thread *td, struct getpid_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + td->td_retval[0] = p->p_pid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + PROC_LOCK(p); + td->td_retval[1] = p->p_pptr->p_pid; + PROC_UNLOCK(p); +#endif + mtx_unlock_giant(s); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getppid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getppid(struct thread *td, struct getppid_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + PROC_LOCK(p); + td->td_retval[0] = p->p_pptr->p_pid; + PROC_UNLOCK(p); + mtx_unlock_giant(s); + return (0); +} + +/* + * Get process group ID; note that POSIX getpgrp takes no parameter. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgrp_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +int +getpgrp(struct thread *td, struct getpgrp_args *uap) +{ + struct proc *p = td->td_proc; + int s; + + s = mtx_lock_giant(kern_giant_proc); + PROC_LOCK(p); + td->td_retval[0] = p->p_pgrp->pg_id; + PROC_UNLOCK(p); + mtx_unlock_giant(s); + return (0); +} + +/* Get an arbitary pid's process group id */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgid_args { + pid_t pid; +}; +#endif +/* + * MPSAFE + */ +int +getpgid(struct thread *td, struct getpgid_args *uap) +{ + struct proc *p = td->td_proc; + struct proc *pt; + int error; + + mtx_lock(&Giant); + error = 0; + if (uap->pid == 0) { + PROC_LOCK(p); + td->td_retval[0] = p->p_pgrp->pg_id; + PROC_UNLOCK(p); + } else if ((pt = pfind(uap->pid)) == NULL) + error = ESRCH; + else { + error = p_cansee(td, pt); + if (error == 0) + td->td_retval[0] = pt->p_pgrp->pg_id; + PROC_UNLOCK(pt); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Get an arbitary pid's session id. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getsid_args { + pid_t pid; +}; +#endif +/* + * MPSAFE + */ +int +getsid(struct thread *td, struct getsid_args *uap) +{ + struct proc *p = td->td_proc; + struct proc *pt; + int error; + + mtx_lock(&Giant); + error = 0; + if (uap->pid == 0) { + PROC_LOCK(p); + td->td_retval[0] = p->p_session->s_sid; + PROC_UNLOCK(p); + } else if ((pt = pfind(uap->pid)) == NULL) + error = ESRCH; + else { + error = p_cansee(td, pt); + if (error == 0) + td->td_retval[0] = pt->p_session->s_sid; + PROC_UNLOCK(pt); + } + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getuid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getuid(struct thread *td, struct getuid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_ruid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_retval[1] = td->td_ucred->cr_uid; +#endif + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct geteuid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +geteuid(struct thread *td, struct geteuid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_uid; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getgid(struct thread *td, struct getgid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_rgid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_retval[1] = td->td_ucred->cr_groups[0]; +#endif + return (0); +} + +/* + * Get effective group ID. The "egid" is groups[0], and could be obtained + * via getgroups. This syscall exists because it is somewhat painful to do + * correctly in a library function. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getegid(struct thread *td, struct getegid_args *uap) +{ + + td->td_retval[0] = td->td_ucred->cr_groups[0]; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* + * MPSAFE + */ +int +getgroups(struct thread *td, register struct getgroups_args *uap) +{ + struct ucred *cred; + u_int ngrp; + int error; + + cred = td->td_ucred; + if ((ngrp = uap->gidsetsize) == 0) { + td->td_retval[0] = cred->cr_ngroups; + return (0); + } + if (ngrp < cred->cr_ngroups) + return (EINVAL); + ngrp = cred->cr_ngroups; + error = copyout((caddr_t)cred->cr_groups, (caddr_t)uap->gidset, + ngrp * sizeof(gid_t)); + if (error == 0) + td->td_retval[0] = ngrp; + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setsid(register struct thread *td, struct setsid_args *uap) +{ + struct pgrp *pgrp; + int error; + struct proc *p = td->td_proc; + struct pgrp *newpgrp; + struct session *newsess; + + error = 0; + pgrp = NULL; + + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); + MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + + if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + error = EPERM; + } else { + (void)enterpgrp(p, p->p_pid, newpgrp, newsess); + td->td_retval[0] = p->p_pid; + newpgrp = NULL; + newsess = NULL; + } + + sx_xunlock(&proctree_lock); + + if (newpgrp != NULL) + FREE(newpgrp, M_PGRP); + if (newsess != NULL) + FREE(newsess, M_SESSION); + + return (error); +} + +/* + * set process group (setpgid/old setpgrp) + * + * caller does 
setpgid(targpid, targpgid) + * + * pid must be caller or child of caller (ESRCH) + * if a child + * pid must be in same session (EPERM) + * pid can't have done an exec (EACCES) + * if pgid != pid + * there must exist some pid in same session having pgid (EPERM) + * pid must not be session leader (EPERM) + */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp id */ +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setpgid(struct thread *td, register struct setpgid_args *uap) +{ + struct proc *curp = td->td_proc; + register struct proc *targp; /* target process */ + register struct pgrp *pgrp; /* target pgrp */ + int error; + struct pgrp *newpgrp; + + if (uap->pgid < 0) + return (EINVAL); + + error = 0; + + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == NULL) { + if (targp) + PROC_UNLOCK(targp); + error = ESRCH; + goto done; + } + if (!inferior(targp)) { + PROC_UNLOCK(targp); + error = ESRCH; + goto done; + } + if ((error = p_cansee(curthread, targp))) { + PROC_UNLOCK(targp); + goto done; + } + if (targp->p_pgrp == NULL || + targp->p_session != curp->p_session) { + PROC_UNLOCK(targp); + error = EPERM; + goto done; + } + if (targp->p_flag & P_EXEC) { + PROC_UNLOCK(targp); + error = EACCES; + goto done; + } + PROC_UNLOCK(targp); + } else + targp = curp; + if (SESS_LEADER(targp)) { + error = EPERM; + goto done; + } + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + if (uap->pgid == targp->p_pid) { + if (targp->p_pgid == uap->pgid) + goto done; + error = enterpgrp(targp, uap->pgid, newpgrp, NULL); + if (error == 0) + newpgrp = NULL; + } else { + if ((pgrp = pgfind(uap->pgid)) == NULL || + pgrp->pg_session != curp->p_session) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + error = EPERM; + goto done; + } + if (pgrp == targp->p_pgrp) { + PGRP_UNLOCK(pgrp); + goto done; + } + PGRP_UNLOCK(pgrp); + error = enterthispgrp(targp, pgrp); + } +done: + sx_xunlock(&proctree_lock); + KASSERT((error == 0) || (newpgrp != NULL), + ("setpgid failed and newpgrp is NULL")); + if (newpgrp != NULL) + FREE(newpgrp, M_PGRP); + return (error); +} + +/* + * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD + * compatible. It says that setting the uid/gid to euid/egid is a special + * case of "appropriate privilege". Once the rules are expanded out, this + * basically means that setuid(nnn) sets all three id's, in all permitted + * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) + * does not set the saved id - this is dangerous for traditional BSD + * programs. For this reason, we *really* do not want to set + * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. + */ +#define POSIX_APPENDIX_B_4_2_2 + +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setuid(struct thread *td, struct setuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t uid; + struct uidinfo *uip; + int error; + + mtx_lock(&Giant); + uid = uap->uid; + newcred = crget(); + uip = uifind(uid); + PROC_LOCK(p); + oldcred = p->p_ucred; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setuid(geteuid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatible with traditional BSD + * semantics. 
Basically, it means that "setuid(xx)" sets all + * three id's (assuming you have privs). + * + * Notes on the logic. We do things in three steps. + * 1: We determine if the euid is going to change, and do EPERM + * right away. We unconditionally change the euid later if this + * test is satisfied, simplifying that part of the logic. + * 2: We determine if the real and/or saved uids are going to + * change. Determined by compile options. + * 3: Change euid last. (after tests in #2 for "appropriate privs") + */ + if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ +#ifdef _POSIX_SAVED_IDS + uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ +#endif + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(uip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + /* + * Copy credentials so other references do not see our changes. + */ + crcopy(newcred, oldcred); +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or uid == euid) + * If so, we are changing the real uid and/or saved uid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ + uid == oldcred->cr_uid || +#endif + suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */ +#endif + { + /* + * Set the real uid and transfer proc count to new user. + */ + if (uid != oldcred->cr_ruid) { + change_ruid(newcred, uip); + setsugid(p); + } + /* + * Set saved uid + * + * XXX always set saved uid even if not _POSIX_SAVED_IDS, as + * the security of seteuid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (uid != oldcred->cr_svuid) { + change_svuid(newcred, uid); + setsugid(p); + } + } + + /* + * In all permitted cases, we are changing the euid. + */ + if (uid != oldcred->cr_uid) { + change_euid(newcred, uip); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(uip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +seteuid(struct thread *td, struct seteuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid; + struct uidinfo *euip; + int error; + + euid = uap->euid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ + euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + /* + * Everything's okay, do it. Copy credentials so other references do + * not see our changes. 
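+ * (The change is made copy-on-write style: the new values go into the
+ * ucred obtained from crget() above, p_ucred is pointed at it while
+ * the proc lock is held, and the old credential is released with
+ * crfree() afterwards, so holders of other references never see a
+ * partially updated credential.)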
+ */ + crcopy(newcred, oldcred); + if (oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setgid(struct thread *td, struct setgid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t gid; + int error; + + gid = uap->gid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setgid(getegid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatible with traditional BSD + * semantics. Basically, it means that "setgid(xx)" sets all + * three id's (assuming you have privs). + * + * For notes on the logic here, see setuid() above. + */ + if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ +#ifdef _POSIX_SAVED_IDS + gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ +#endif + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or gid == egid) + * If so, we are changing the real uid and saved gid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ + gid == oldcred->cr_groups[0] || +#endif + suser_cred(oldcred, PRISON_ROOT) == 0) /* we are using privs */ +#endif + { + /* + * Set real gid + */ + if (oldcred->cr_rgid != gid) { + change_rgid(newcred, gid); + setsugid(p); + } + /* + * Set saved gid + * + * XXX always set saved gid even if not _POSIX_SAVED_IDS, as + * the security of setegid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (oldcred->cr_svgid != gid) { + change_svgid(newcred, gid); + setsugid(p); + } + } + /* + * In all cases permitted cases, we are changing the egid. + * Copy credentials so other references do not see our changes. 
+ */ + if (oldcred->cr_groups[0] != gid) { + change_egid(newcred, gid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setegid(struct thread *td, struct setegid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid; + int error; + + egid = uap->egid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ + egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + crcopy(newcred, oldcred); + if (oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setgroups(struct thread *td, struct setgroups_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *tempcred, *oldcred; + u_int ngrp; + int error; + + ngrp = uap->gidsetsize; + if (ngrp > NGROUPS) + return (EINVAL); + mtx_lock(&Giant); + tempcred = crget(); + error = copyin((caddr_t)uap->gidset, (caddr_t)tempcred->cr_groups, + ngrp * sizeof(gid_t)); + if (error != 0) { + crfree(tempcred); + mtx_unlock(&Giant); + return (error); + } + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + error = suser_cred(oldcred, PRISON_ROOT); + if (error) { + PROC_UNLOCK(p); + crfree(newcred); + crfree(tempcred); + mtx_unlock(&Giant); + return (error); + } + + /* + * XXX A little bit lazy here. We could test if anything has + * changed before crcopy() and setting P_SUGID. + */ + crcopy(newcred, oldcred); + if (ngrp < 1) { + /* + * setgroups(0, NULL) is a legitimate way of clearing the + * groups vector on non-BSD systems (which generally do not + * have the egid in the groups[0]). We risk security holes + * when running non-BSD software if we do not do the same. 
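+ * (cr_groups[0] is the effective gid on BSD, so cr_ngroups can never
+ * drop below 1; clearing the vector keeps the egid in place and only
+ * discards the supplementary groups.)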
+ */ + newcred->cr_ngroups = 1; + } else { + bcopy(tempcred->cr_groups, newcred->cr_groups, + ngrp * sizeof(gid_t)); + newcred->cr_ngroups = ngrp; + } + setsugid(p); + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(tempcred); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setreuid(register struct thread *td, struct setreuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid, ruid; + struct uidinfo *euip, *ruip; + int error; + + euid = uap->euid; + ruid = uap->ruid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + ruip = uifind(ruid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && + ruid != oldcred->cr_svuid) || + (euid != (uid_t)-1 && euid != oldcred->cr_uid && + euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + crcopy(newcred, oldcred); + if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { + change_ruid(newcred, ruip); + setsugid(p); + } + if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && + newcred->cr_svuid != newcred->cr_uid) { + change_svuid(newcred, newcred->cr_uid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setregid(register struct thread *td, struct setregid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid, rgid; + int error; + + egid = uap->egid; + rgid = uap->rgid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && + rgid != oldcred->cr_svgid) || + (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && + egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { + change_rgid(newcred, rgid); + setsugid(p); + } + if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && + newcred->cr_svgid != newcred->cr_groups[0]) { + change_svgid(newcred, newcred->cr_groups[0]); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +/* + * setresuid(ruid, euid, suid) is like setreuid except control over the + * saved uid is explicit. 
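+ * (Making the saved uid explicit turns a permanent privilege drop
+ * into one easily audited call, e.g. setresuid(uid, uid, uid) from a
+ * process that started with euid 0; setresgid() below does the same
+ * for the group ids.)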
+ */ + +#ifndef _SYS_SYSPROTO_H_ +struct setresuid_args { + uid_t ruid; + uid_t euid; + uid_t suid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setresuid(register struct thread *td, struct setresuid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + uid_t euid, ruid, suid; + struct uidinfo *euip, *ruip; + int error; + + euid = uap->euid; + ruid = uap->ruid; + suid = uap->suid; + mtx_lock(&Giant); + newcred = crget(); + euip = uifind(euid); + ruip = uifind(ruid); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && + ruid != oldcred->cr_svuid && + ruid != oldcred->cr_uid) || + (euid != (uid_t)-1 && euid != oldcred->cr_ruid && + euid != oldcred->cr_svuid && + euid != oldcred->cr_uid) || + (suid != (uid_t)-1 && suid != oldcred->cr_ruid && + suid != oldcred->cr_svuid && + suid != oldcred->cr_uid)) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { + change_euid(newcred, euip); + setsugid(p); + } + if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { + change_ruid(newcred, ruip); + setsugid(p); + } + if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { + change_svuid(newcred, suid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + uifree(ruip); + uifree(euip); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +/* + * setresgid(rgid, egid, sgid) is like setregid except control over the + * saved gid is explicit. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct setresgid_args { + gid_t rgid; + gid_t egid; + gid_t sgid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setresgid(register struct thread *td, struct setresgid_args *uap) +{ + struct proc *p = td->td_proc; + struct ucred *newcred, *oldcred; + gid_t egid, rgid, sgid; + int error; + + egid = uap->egid; + rgid = uap->rgid; + sgid = uap->sgid; + mtx_lock(&Giant); + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && + rgid != oldcred->cr_svgid && + rgid != oldcred->cr_groups[0]) || + (egid != (gid_t)-1 && egid != oldcred->cr_rgid && + egid != oldcred->cr_svgid && + egid != oldcred->cr_groups[0]) || + (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && + sgid != oldcred->cr_svgid && + sgid != oldcred->cr_groups[0])) && + (error = suser_cred(oldcred, PRISON_ROOT)) != 0) { + PROC_UNLOCK(p); + crfree(newcred); + mtx_unlock(&Giant); + return (error); + } + + crcopy(newcred, oldcred); + if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { + change_egid(newcred, egid); + setsugid(p); + } + if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { + change_rgid(newcred, rgid); + setsugid(p); + } + if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { + change_svgid(newcred, sgid); + setsugid(p); + } + p->p_ucred = newcred; + PROC_UNLOCK(p); + crfree(oldcred); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getresuid_args { + uid_t *ruid; + uid_t *euid; + uid_t *suid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getresuid(register struct thread *td, struct getresuid_args *uap) +{ + struct ucred *cred; + int error1 = 0, error2 = 0, error3 = 0; + + cred = td->td_ucred; + if (uap->ruid) + error1 = copyout((caddr_t)&cred->cr_ruid, + (caddr_t)uap->ruid, sizeof(cred->cr_ruid)); + if (uap->euid) + error2 = copyout((caddr_t)&cred->cr_uid, + 
(caddr_t)uap->euid, sizeof(cred->cr_uid)); + if (uap->suid) + error3 = copyout((caddr_t)&cred->cr_svuid, + (caddr_t)uap->suid, sizeof(cred->cr_svuid)); + return (error1 ? error1 : error2 ? error2 : error3); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getresgid_args { + gid_t *rgid; + gid_t *egid; + gid_t *sgid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getresgid(register struct thread *td, struct getresgid_args *uap) +{ + struct ucred *cred; + int error1 = 0, error2 = 0, error3 = 0; + + cred = td->td_ucred; + if (uap->rgid) + error1 = copyout((caddr_t)&cred->cr_rgid, + (caddr_t)uap->rgid, sizeof(cred->cr_rgid)); + if (uap->egid) + error2 = copyout((caddr_t)&cred->cr_groups[0], + (caddr_t)uap->egid, sizeof(cred->cr_groups[0])); + if (uap->sgid) + error3 = copyout((caddr_t)&cred->cr_svgid, + (caddr_t)uap->sgid, sizeof(cred->cr_svgid)); + return (error1 ? error1 : error2 ? error2 : error3); +} + +#ifndef _SYS_SYSPROTO_H_ +struct issetugid_args { + int dummy; +}; +#endif +/* + * NOT MPSAFE? + */ +/* ARGSUSED */ +int +issetugid(register struct thread *td, struct issetugid_args *uap) +{ + struct proc *p = td->td_proc; + + /* + * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, + * we use P_SUGID because we consider changing the owners as + * "tainting" as well. + * This is significant for procs that start as root and "become" + * a user without an exec - programs cannot know *everything* + * that libc *might* have put in their data segment. + */ + PROC_LOCK(p); + td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; + PROC_UNLOCK(p); + return (0); +} + +/* + * MPSAFE + */ +int +__setugid(struct thread *td, struct __setugid_args *uap) +{ +#ifdef REGRESSION + struct proc *p; + + p = td->td_proc; + switch (uap->flag) { + case 0: + mtx_lock(&Giant); + PROC_LOCK(p); + p->p_flag &= ~P_SUGID; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); + case 1: + mtx_lock(&Giant); + PROC_LOCK(p); + p->p_flag |= P_SUGID; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); + default: + return (EINVAL); + } +#else /* !REGRESSION */ + + return (ENOSYS); +#endif /* REGRESSION */ +} + +/* + * Check if gid is a member of the group set. + * + * MPSAFE (cred must be held) + */ +int +groupmember(gid_t gid, struct ucred *cred) +{ + register gid_t *gp; + gid_t *egp; + + egp = &(cred->cr_groups[cred->cr_ngroups]); + for (gp = cred->cr_groups; gp < egp; gp++) + if (*gp == gid) + return (1); + return (0); +} + +/* + * `suser_enabled' (which can be set by the security.suser_enabled + * sysctl) determines whether the system 'super-user' policy is in effect. + * If it is nonzero, an effective uid of 0 connotes special privilege, + * overriding many mandatory and discretionary protections. If it is zero, + * uid 0 is offered no special privilege in the kernel security policy. + * Setting it to zero may seriously impact the functionality of many + * existing userland programs, and should not be done without careful + * consideration of the consequences. + */ +int suser_enabled = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW, + &suser_enabled, 0, "processes with uid 0 have privilege"); +TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled); + +/* + * Test whether the specified credentials imply "super-user" privilege. + * Return 0 or EPERM. The flag argument is currently used only to + * specify jail interaction. 
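+ * (PRISON_ROOT in the flag means "uid 0 inside a jail is acceptable
+ * for this particular check"; without it a jailed root credential is
+ * refused.  suser() below is the common wrapper that passes flag 0
+ * for the current thread's credential.)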
+ */ +int +suser_cred(struct ucred *cred, int flag) +{ + + if (!suser_enabled) + return (EPERM); + if (cred->cr_uid != 0) + return (EPERM); + if (jailed(cred) && !(flag & PRISON_ROOT)) + return (EPERM); + return (0); +} + +/* + * Shortcut to hide contents of struct td and struct proc from the + * caller, promoting binary compatibility. + */ +int +suser(struct thread *td) +{ + + return (suser_cred(td->td_ucred, 0)); +} + +/* + * Test the active securelevel against a given level. securelevel_gt() + * implements (securelevel > level). securelevel_ge() implements + * (securelevel >= level). Note that the logic is inverted -- these + * functions return EPERM on "success" and 0 on "failure". + * + * MPSAFE + */ +int +securelevel_gt(struct ucred *cr, int level) +{ + int active_securelevel; + + active_securelevel = securelevel; + KASSERT(cr != NULL, ("securelevel_gt: null cr")); + if (cr->cr_prison != NULL) { + mtx_lock(&cr->cr_prison->pr_mtx); + active_securelevel = imax(cr->cr_prison->pr_securelevel, + active_securelevel); + mtx_unlock(&cr->cr_prison->pr_mtx); + } + return (active_securelevel > level ? EPERM : 0); +} + +int +securelevel_ge(struct ucred *cr, int level) +{ + int active_securelevel; + + active_securelevel = securelevel; + KASSERT(cr != NULL, ("securelevel_ge: null cr")); + if (cr->cr_prison != NULL) { + mtx_lock(&cr->cr_prison->pr_mtx); + active_securelevel = imax(cr->cr_prison->pr_securelevel, + active_securelevel); + mtx_unlock(&cr->cr_prison->pr_mtx); + } + return (active_securelevel >= level ? EPERM : 0); +} + +/* + * 'see_other_uids' determines whether or not visibility of processes + * and sockets with credentials holding different real uids is possible + * using a variety of system MIBs. + * XXX: data declarations should be together near the beginning of the file. + */ +static int see_other_uids = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, + &see_other_uids, 0, + "Unprivileged processes may see subjects/objects with different real uid"); + +/*- + * Determine if u1 "can see" the subject specified by u2, according to the + * 'see_other_uids' policy. + * Returns: 0 for permitted, ESRCH otherwise + * Locks: none + * References: *u1 and *u2 must not change during the call + * u1 may equal u2, in which case only one reference is required + */ +static int +cr_seeotheruids(struct ucred *u1, struct ucred *u2) +{ + + if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { + if (suser_cred(u1, PRISON_ROOT) != 0) + return (ESRCH); + } + return (0); +} + +/*- + * Determine if u1 "can see" the subject specified by u2. + * Returns: 0 for permitted, an errno value otherwise + * Locks: none + * References: *u1 and *u2 must not change during the call + * u1 may equal u2, in which case only one reference is required + */ +int +cr_cansee(struct ucred *u1, struct ucred *u2) +{ + int error; + + if ((error = prison_check(u1, u2))) + return (error); + if ((error = cr_seeotheruids(u1, u2))) + return (error); + return (0); +} + +/*- + * Determine if td "can see" the subject specified by p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect p->p_ucred must be held. td really + * should be curthread. + * References: td and p must be valid for the lifetime of the call + */ +int +p_cansee(struct thread *td, struct proc *p) +{ + + /* Wrap cr_cansee() for all functionality. 
*/ + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + return (cr_cansee(td->td_ucred, p->p_ucred)); +} + +/*- + * Determine whether cred may deliver the specified signal to proc. + * Returns: 0 for permitted, an errno value otherwise. + * Locks: A lock must be held for proc. + * References: cred and proc must be valid for the lifetime of the call. + */ +int +cr_cansignal(struct ucred *cred, struct proc *proc, int signum) +{ + int error; + + PROC_LOCK_ASSERT(proc, MA_OWNED); + /* + * Jail semantics limit the scope of signalling to proc in the + * same jail as cred, if cred is in jail. + */ + error = prison_check(cred, proc->p_ucred); + if (error) + return (error); + error = cr_seeotheruids(cred, proc->p_ucred); + if (error) + return (error); + + /* + * UNIX signal semantics depend on the status of the P_SUGID + * bit on the target process. If the bit is set, then additional + * restrictions are placed on the set of available signals. + */ + if (proc->p_flag & P_SUGID) { + switch (signum) { + case 0: + case SIGKILL: + case SIGINT: + case SIGTERM: + case SIGSTOP: + case SIGTTIN: + case SIGTTOU: + case SIGTSTP: + case SIGHUP: + case SIGUSR1: + case SIGUSR2: + /* + * Generally, permit job and terminal control + * signals. + */ + break; + default: + /* Not permitted without privilege. */ + error = suser_cred(cred, PRISON_ROOT); + if (error) + return (error); + } + } + + /* + * Generally, the target credential's ruid or svuid must match the + * subject credential's ruid or euid. + */ + if (cred->cr_ruid != proc->p_ucred->cr_ruid && + cred->cr_ruid != proc->p_ucred->cr_svuid && + cred->cr_uid != proc->p_ucred->cr_ruid && + cred->cr_uid != proc->p_ucred->cr_svuid) { + /* Not permitted without privilege. */ + error = suser_cred(cred, PRISON_ROOT); + if (error) + return (error); + } + + return (0); +} + + +/*- + * Determine whether td may deliver the specified signal to p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must be + * held for p. + * References: td and p must be valid for the lifetime of the call + */ +int +p_cansignal(struct thread *td, struct proc *p, int signum) +{ + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (td->td_proc == p) + return (0); + + /* + * UNIX signalling semantics require that processes in the same + * session always be able to deliver SIGCONT to one another, + * overriding the remaining protections. + */ + /* XXX: This will require an additional lock of some sort. */ + if (signum == SIGCONT && td->td_proc->p_session == p->p_session) + return (0); + + return (cr_cansignal(td->td_ucred, p, signum)); +} + +/*- + * Determine whether td may reschedule p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must + * be held for p. 
+ * References: td and p must be valid for the lifetime of the call + */ +int +p_cansched(struct thread *td, struct proc *p) +{ + int error; + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (td->td_proc == p) + return (0); + if ((error = prison_check(td->td_ucred, p->p_ucred))) + return (error); + if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) + return (error); + if (td->td_ucred->cr_ruid == p->p_ucred->cr_ruid) + return (0); + if (td->td_ucred->cr_uid == p->p_ucred->cr_ruid) + return (0); + if (suser_cred(td->td_ucred, PRISON_ROOT) == 0) + return (0); + +#ifdef CAPABILITIES + if (!cap_check(NULL, td, CAP_SYS_NICE, PRISON_ROOT)) + return (0); +#endif + + return (EPERM); +} + +/* + * The 'unprivileged_proc_debug' flag may be used to disable a variety of + * unprivileged inter-process debugging services, including some procfs + * functionality, ptrace(), and ktrace(). In the past, inter-process + * debugging has been involved in a variety of security problems, and sites + * not requiring the service might choose to disable it when hardening + * systems. + * + * XXX: Should modifying and reading this variable require locking? + * XXX: data declarations should be together near the beginning of the file. + */ +static int unprivileged_proc_debug = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, + &unprivileged_proc_debug, 0, + "Unprivileged processes may use process debugging facilities"); + +/*- + * Determine whether td may debug p. + * Returns: 0 for permitted, an errno value otherwise + * Locks: Sufficient locks to protect various components of td and p + * must be held. td must be curthread, and a lock must + * be held for p. + * References: td and p must be valid for the lifetime of the call + */ +int +p_candebug(struct thread *td, struct proc *p) +{ + int credentialchanged, error, grpsubset, i, uidsubset; + + KASSERT(td == curthread, ("%s: td not curthread", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + if (!unprivileged_proc_debug) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + if (td->td_proc == p) + return (0); + if ((error = prison_check(td->td_ucred, p->p_ucred))) + return (error); + if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) + return (error); + + /* + * Is p's group set a subset of td's effective group set? This + * includes p's egid, group access list, rgid, and svgid. + */ + grpsubset = 1; + for (i = 0; i < p->p_ucred->cr_ngroups; i++) { + if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { + grpsubset = 0; + break; + } + } + grpsubset = grpsubset && + groupmember(p->p_ucred->cr_rgid, td->td_ucred) && + groupmember(p->p_ucred->cr_svgid, td->td_ucred); + + /* + * Are the uids present in p's credential equal to td's + * effective uid? This includes p's euid, svuid, and ruid. + */ + uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && + td->td_ucred->cr_uid == p->p_ucred->cr_svuid && + td->td_ucred->cr_uid == p->p_ucred->cr_ruid); + + /* + * Has the credential of the process changed since the last exec()? + */ + credentialchanged = (p->p_flag & P_SUGID); + + /* + * If p's gids aren't a subset, or the uids aren't a subset, + * or the credential has changed, require appropriate privilege + * for td to debug p. For POSIX.1e capabilities, this will + * require CAP_SYS_PTRACE. 
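+ * (Concretely: all three of the target's uids must equal the
+ * debugger's effective uid, every gid the target holds must be in the
+ * debugger's group set, and the target must not have P_SUGID set from
+ * a credential change since its last exec; otherwise superuser
+ * privilege is required.)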
+ */ + if (!grpsubset || !uidsubset || credentialchanged) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + + /* Can't trace init when securelevel > 0. */ + if (p == initproc) { + error = securelevel_gt(td->td_ucred, 0); + if (error) + return (error); + } + + /* + * Can't trace a process that's currently exec'ing. + * XXX: Note, this is not a security policy decision, it's a + * basic correctness/functionality decision. Therefore, this check + * should be moved to the caller's of p_candebug(). + */ + if ((p->p_flag & P_INEXEC) != 0) + return (EAGAIN); + + return (0); +} + +/*- + * Determine whether the subject represented by cred can "see" a socket. + * Returns: 0 for permitted, ENOENT otherwise. + */ +int +cr_canseesocket(struct ucred *cred, struct socket *so) +{ + int error; + + error = prison_check(cred, so->so_cred); + if (error) + return (ENOENT); + if (cr_seeotheruids(cred, so->so_cred)) + return (ENOENT); +#ifdef MAC + /* XXX: error = mac_cred_check_seesocket() here. */ +#endif + + return (0); +} + +/* + * Allocate a zeroed cred structure. + */ +struct ucred * +crget(void) +{ + register struct ucred *cr; + + MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); + cr->cr_ref = 1; + cr->cr_mtxp = mtx_pool_find(cr); + return (cr); +} + +/* + * Claim another reference to a ucred structure. + */ +struct ucred * +crhold(struct ucred *cr) +{ + + mtx_lock(cr->cr_mtxp); + cr->cr_ref++; + mtx_unlock(cr->cr_mtxp); + return (cr); +} + +/* + * Free a cred structure. + * Throws away space when ref count gets to 0. + */ +void +crfree(struct ucred *cr) +{ + struct mtx *mtxp = cr->cr_mtxp; + + mtx_lock(mtxp); + KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); + if (--cr->cr_ref == 0) { + /* + * Some callers of crget(), such as nfs_statfs(), + * allocate a temporary credential, but don't + * allocate a uidinfo structure. + */ + mtx_unlock(mtxp); + mtx_lock(&Giant); + if (cr->cr_uidinfo != NULL) + uifree(cr->cr_uidinfo); + if (cr->cr_ruidinfo != NULL) + uifree(cr->cr_ruidinfo); + /* + * Free a prison, if any. + */ + if (jailed(cr)) + prison_free(cr->cr_prison); + FREE((caddr_t)cr, M_CRED); + mtx_unlock(&Giant); + } else { + mtx_unlock(mtxp); + } +} + +/* + * Check to see if this ucred is shared. + */ +int +crshared(struct ucred *cr) +{ + int shared; + + mtx_lock(cr->cr_mtxp); + shared = (cr->cr_ref > 1); + mtx_unlock(cr->cr_mtxp); + return (shared); +} + +/* + * Copy a ucred's contents from a template. Does not block. + */ +void +crcopy(struct ucred *dest, struct ucred *src) +{ + + KASSERT(crshared(dest) == 0, ("crcopy of shared ucred")); + bcopy(&src->cr_startcopy, &dest->cr_startcopy, + (unsigned)((caddr_t)&src->cr_endcopy - + (caddr_t)&src->cr_startcopy)); + uihold(dest->cr_uidinfo); + uihold(dest->cr_ruidinfo); + if (jailed(dest)) + prison_hold(dest->cr_prison); +} + +/* + * Dup cred struct to a new held one. + */ +struct ucred * +crdup(struct ucred *cr) +{ + struct ucred *newcr; + + newcr = crget(); + crcopy(newcr, cr); + return (newcr); +} + +/* + * Fill in a struct xucred based on a struct ucred. + */ +void +cru2x(struct ucred *cr, struct xucred *xcr) +{ + + bzero(xcr, sizeof(*xcr)); + xcr->cr_version = XUCRED_VERSION; + xcr->cr_uid = cr->cr_uid; + xcr->cr_ngroups = cr->cr_ngroups; + bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups)); +} + +/* + * small routine to swap a thread's current ucred for the correct one + * taken from the process. 
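+ * (td_ucred is a per-thread reference intended to be refreshed on
+ * kernel entry, so that most credential checks can use it without
+ * taking the proc lock; this brings it back in sync with p_ucred
+ * after the latter may have been replaced.)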
+ */ +void +cred_update_thread(struct thread *td) +{ + struct proc *p; + struct ucred *cred; + + p = td->td_proc; + cred = td->td_ucred; + mtx_lock(&Giant); + PROC_LOCK(p); + td->td_ucred = crhold(p->p_ucred); + PROC_UNLOCK(p); + if (cred != NULL) + crfree(cred); + mtx_unlock(&Giant); +} + +/* + * Get login name, if available. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getlogin(struct thread *td, struct getlogin_args *uap) +{ + int error; + char login[MAXLOGNAME]; + struct proc *p = td->td_proc; + + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; + PROC_LOCK(p); + SESS_LOCK(p->p_session); + bcopy(p->p_session->s_login, login, uap->namelen); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + error = copyout((caddr_t) login, (caddr_t) uap->namebuf, uap->namelen); + return(error); +} + +/* + * Set login name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setlogin(struct thread *td, struct setlogin_args *uap) +{ + struct proc *p = td->td_proc; + int error; + char logintmp[MAXLOGNAME]; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (size_t *)0); + if (error == ENAMETOOLONG) + error = EINVAL; + else if (!error) { + PROC_LOCK(p); + SESS_LOCK(p->p_session); + (void) memcpy(p->p_session->s_login, logintmp, + sizeof(logintmp)); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + } + return (error); +} + +void +setsugid(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + p->p_flag |= P_SUGID; + if (!(p->p_pfsflags & PF_ISUGID)) + p->p_stops = 0; +} + +/*- + * Change a process's effective uid. + * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_euid(struct ucred *newcred, struct uidinfo *euip) +{ + + newcred->cr_uid = euip->ui_uid; + uihold(euip); + uifree(newcred->cr_uidinfo); + newcred->cr_uidinfo = euip; +} + +/*- + * Change a process's effective gid. + * Side effects: newcred->cr_gid will be modified. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_egid(struct ucred *newcred, gid_t egid) +{ + + newcred->cr_groups[0] = egid; +} + +/*- + * Change a process's real uid. + * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo + * will be updated, and the old and new cr_ruidinfo proc + * counts will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_ruid(struct ucred *newcred, struct uidinfo *ruip) +{ + + (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); + newcred->cr_ruid = ruip->ui_uid; + uihold(ruip); + uifree(newcred->cr_ruidinfo); + newcred->cr_ruidinfo = ruip; + (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); +} + +/*- + * Change a process's real gid. + * Side effects: newcred->cr_rgid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_rgid(struct ucred *newcred, gid_t rgid) +{ + + newcred->cr_rgid = rgid; +} + +/*- + * Change a process's saved uid. + * Side effects: newcred->cr_svuid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. 
+ */ +void +change_svuid(struct ucred *newcred, uid_t svuid) +{ + + newcred->cr_svuid = svuid; +} + +/*- + * Change a process's saved gid. + * Side effects: newcred->cr_svgid will be updated. + * References: newcred must be an exclusive credential reference for the + * duration of the call. + */ +void +change_svgid(struct ucred *newcred, gid_t svgid) +{ + + newcred->cr_svgid = svgid; +} diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c new file mode 100644 index 0000000..d467c1a --- /dev/null +++ b/sys/kern/kern_resource.c @@ -0,0 +1,1020 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/file.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sx.h> +#include <sys/time.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int donice(struct thread *td, struct proc *chgp, int n); + +static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); +#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) +static struct mtx uihashtbl_mtx; +static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; +static u_long uihash; /* size of hash table - 1 */ + +static struct uidinfo *uilookup(uid_t uid); + +/* + * Resource controls and accounting. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif +/* + * MPSAFE + */ +int +getpriority(td, uap) + struct thread *td; + register struct getpriority_args *uap; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + int error = 0; + + mtx_lock(&Giant); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) + low = td->td_ksegrp->kg_nice; + else { + p = pfind(uap->who); + if (p == NULL) + break; + if (p_cansee(td, p) == 0) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + sx_slock(&proctree_lock); + if (uap->who == 0) { + pg = td->td_proc->p_pgrp; + PGRP_LOCK(pg); + } else { + pg = pgfind(uap->who); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + break; + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + if (!p_cansee(td, p) && p->p_ksegrp.kg_nice /* XXXKSE */ < low) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pg); + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = td->td_ucred->cr_uid; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (!p_cansee(td, p) && + p->p_ucred->cr_uid == uap->who && + p->p_ksegrp.kg_nice /* XXXKSE */ < low) + low = p->p_ksegrp.kg_nice /* XXXKSE */ ; + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + break; + + default: + error = EINVAL; + break; + } + if (low == PRIO_MAX + 1 && error == 0) + error = ESRCH; + td->td_retval[0] = low; + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setpriority(td, uap) + struct thread *td; + register struct setpriority_args *uap; +{ + struct proc *curp = td->td_proc; + register struct proc *p; + int found = 0, error = 0; + + mtx_lock(&Giant); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) { + PROC_LOCK(curp); + error = donice(td, curp, uap->prio); + PROC_UNLOCK(curp); + } else { + p = pfind(uap->who); + if (p == 0) + break; + if (p_cansee(td, p) == 0) + error = donice(td, p, uap->prio); + PROC_UNLOCK(p); + } + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + sx_slock(&proctree_lock); + if (uap->who == 0) { + pg = curp->p_pgrp; + PGRP_LOCK(pg); + } else { + pg = pgfind(uap->who); + if (pg == NULL) { + sx_sunlock(&proctree_lock); + break; + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + PROC_LOCK(p); + if (!p_cansee(td, p)) { + error = donice(td, p, 
uap->prio); + found++; + } + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pg); + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = td->td_ucred->cr_uid; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_ucred->cr_uid == uap->who && + !p_cansee(td, p)) { + error = donice(td, p, uap->prio); + found++; + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + break; + + default: + error = EINVAL; + break; + } + if (found == 0 && error == 0) + error = ESRCH; + mtx_unlock(&Giant); + return (error); +} + +static int +donice(td, chgp, n) + struct thread *td; + register struct proc *chgp; + register int n; +{ + int error; + + PROC_LOCK_ASSERT(chgp, MA_OWNED); + if ((error = p_cansched(td, chgp))) + return (error); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_ksegrp.kg_nice /* XXXKSE */ && suser(td)) + return (EACCES); + chgp->p_ksegrp.kg_nice /* XXXKSE */ = n; + (void)resetpriority(&chgp->p_ksegrp); /* XXXKSE */ + return (0); +} + +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +rtprio(td, uap) + struct thread *td; + register struct rtprio_args *uap; +{ + struct proc *curp = td->td_proc; + register struct proc *p; + struct rtprio rtp; + int error, cierror = 0; + + /* Perform copyin before acquiring locks if needed. */ + if (uap->function == RTP_SET) + cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + + if (uap->pid == 0) { + p = curp; + PROC_LOCK(p); + } else { + p = pfind(uap->pid); + if (p == NULL) + return (ESRCH); + } + + switch (uap->function) { + case RTP_LOOKUP: + if ((error = p_cansee(td, p))) + break; + mtx_lock_spin(&sched_lock); + pri_to_rtp(&p->p_ksegrp /* XXXKSE */ , &rtp); + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if ((error = p_cansched(td, p)) || (error = cierror)) + break; + /* disallow setting rtprio in most cases if not superuser */ + if (suser(td) != 0) { + /* can't set someone else's */ + if (uap->pid) { + error = EPERM; + break; + } + /* can't set realtime priority */ +/* + * Realtime priority has to be restricted for reasons which should be + * obvious. However, for idle priority, there is a potential for + * system deadlock if an idleprio process gains a lock on a resource + * that other processes need (and the idleprio process can't run + * due to a CPU-bound normal process). Fix me! 
XXX + */ +#if 0 + if (RTP_PRIO_IS_REALTIME(rtp.type)) +#endif + if (rtp.type != RTP_PRIO_NORMAL) { + error = EPERM; + break; + } + } + mtx_lock_spin(&sched_lock); + error = rtp_to_pri(&rtp, &p->p_ksegrp); + mtx_unlock_spin(&sched_lock); + break; + default: + error = EINVAL; + break; + } + PROC_UNLOCK(p); + return (error); +} + +int +rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg) +{ + + if (rtp->prio > RTP_PRIO_MAX) + return (EINVAL); + switch (RTP_PRIO_BASE(rtp->type)) { + case RTP_PRIO_REALTIME: + kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio; + break; + case RTP_PRIO_NORMAL: + kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio; + break; + case RTP_PRIO_IDLE: + kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio; + break; + default: + return (EINVAL); + } + kg->kg_pri_class = rtp->type; + if (curthread->td_ksegrp == kg) { + curthread->td_base_pri = kg->kg_user_pri; + curthread->td_priority = kg->kg_user_pri; /* XXX dubious */ + } + return (0); +} + +void +pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp) +{ + + switch (PRI_BASE(kg->kg_pri_class)) { + case PRI_REALTIME: + rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME; + break; + case PRI_TIMESHARE: + rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE; + break; + case PRI_IDLE: + rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE; + break; + default: + break; + } + rtp->type = kg->kg_pri_class; +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osetrlimit(td, uap) + struct thread *td; + register struct osetrlimit_args *uap; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + mtx_lock(&Giant); + error = dosetrlimit(td, uap->which, &lim); + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogetrlimit(td, uap) + struct thread *td; + register struct ogetrlimit_args *uap; +{ + struct proc *p = td->td_proc; + struct orlimit olim; + int error; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + mtx_lock(&Giant); + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + error = copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim)); + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setrlimit(td, uap) + struct thread *td; + register struct __setrlimit_args *uap; +{ + struct rlimit alim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) + return (error); + mtx_lock(&Giant); + error = dosetrlimit(td, uap->which, &alim); + mtx_unlock(&Giant); + return (error); +} + +int +dosetrlimit(td, which, limp) + struct thread *td; + u_int which; + struct rlimit *limp; +{ + struct proc *p = td->td_proc; + register struct rlimit *alimp; + int error; + + GIANT_REQUIRED; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + + /* + * Preserve historical bugs by treating negative limits as unsigned. 
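+	 * A limit value that arrives negative from userland is therefore
+	 * taken to mean RLIM_INFINITY rather than being rejected with EINVAL.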
+ */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if ((error = suser_cred(td->td_ucred, PRISON_ROOT))) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_CPU: + if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) + p->p_limit->p_cpulimit = RLIM_INFINITY; + else + p->p_limit->p_cpulimit = + (rlim_t)1000000 * limp->rlim_cur; + break; + case RLIMIT_DATA: + if (limp->rlim_cur > maxdsiz) + limp->rlim_cur = maxdsiz; + if (limp->rlim_max > maxdsiz) + limp->rlim_max = maxdsiz; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > maxssiz) + limp->rlim_cur = maxssiz; + if (limp->rlim_max > maxssiz) + limp->rlim_max = maxssiz; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. If stack limit is going + * up make more accessible, if going down make inaccessible. + */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; + if (limp->rlim_cur < 1) + limp->rlim_cur = 1; + if (limp->rlim_max < 1) + limp->rlim_max = 1; + break; + } + *alimp = *limp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getrlimit(td, uap) + struct thread *td; + register struct __getrlimit_args *uap; +{ + int error; + struct proc *p = td->td_proc; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + mtx_lock(&Giant); + error = copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit)); + mtx_unlock(&Giant); + return(error); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. + */ +void +calcru(p, up, sp, ip) + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; +{ + /* {user, system, interrupt, total} {ticks, usec}; previous tu: */ + u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu; + u_int64_t uut = 0, sut = 0, iut = 0; + int s; + struct timeval tv; + struct bintime bt; + struct kse *ke; + struct ksegrp *kg; + + mtx_assert(&sched_lock, MA_OWNED); + /* XXX: why spl-protect ? 
worst case is an off-by-one report */ + + FOREACH_KSEGRP_IN_PROC(p, kg) { + /* we could accumulate per ksegrp and per process here*/ + FOREACH_KSE_IN_GROUP(kg, ke) { + s = splstatclock(); + ut = ke->ke_uticks; + st = ke->ke_sticks; + it = ke->ke_iticks; + splx(s); + + tt = ut + st + it; + if (tt == 0) { + st = 1; + tt = 1; + } + + if (ke == curthread->td_kse) { + /* + * Adjust for the current time slice. This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. + * XXXKSE use a different test due to threads on other + * processors also being 'current'. + */ + + binuptime(&bt); + bintime_sub(&bt, PCPU_PTR(switchtime)); + bintime_add(&bt, &p->p_runtime); + } else { + bt = p->p_runtime; + } + bintime2timeval(&bt, &tv); + tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec; + ptu = ke->ke_uu + ke->ke_su + ke->ke_iu; + if (tu < ptu || (int64_t)tu < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time of %ld usec for pid %d (%s)\n", + (long)tu, p->p_pid, p->p_comm); + tu = ptu; + } + + /* Subdivide tu. */ + uu = (tu * ut) / tt; + su = (tu * st) / tt; + iu = tu - uu - su; + + /* Enforce monotonicity. */ + if (uu < ke->ke_uu || su < ke->ke_su || iu < ke->ke_iu) { + if (uu < ke->ke_uu) + uu = ke->ke_uu; + else if (uu + ke->ke_su + ke->ke_iu > tu) + uu = tu - ke->ke_su - ke->ke_iu; + if (st == 0) + su = ke->ke_su; + else { + su = ((tu - uu) * st) / (st + it); + if (su < ke->ke_su) + su = ke->ke_su; + else if (uu + su + ke->ke_iu > tu) + su = tu - uu - ke->ke_iu; + } + KASSERT(uu + su + ke->ke_iu <= tu, + ("calcru: monotonisation botch 1")); + iu = tu - uu - su; + KASSERT(iu >= ke->ke_iu, + ("calcru: monotonisation botch 2")); + } + ke->ke_uu = uu; + ke->ke_su = su; + ke->ke_iu = iu; + uut += uu; + sut += su; + iut += iu; + + } /* end kse loop */ + } /* end kseg loop */ + up->tv_sec = uut / 1000000; + up->tv_usec = uut % 1000000; + sp->tv_sec = sut / 1000000; + sp->tv_usec = sut % 1000000; + if (ip != NULL) { + ip->tv_sec = iut / 1000000; + ip->tv_usec = iut % 1000000; + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getrusage(td, uap) + register struct thread *td; + register struct getrusage_args *uap; +{ + struct proc *p = td->td_proc; + register struct rusage *rup; + int error = 0; + + mtx_lock(&Giant); + + switch (uap->who) { + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + mtx_lock_spin(&sched_lock); + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + mtx_unlock_spin(&sched_lock); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + rup = NULL; + error = EINVAL; + break; + } + mtx_unlock(&Giant); + if (error == 0) { + error = copyout((caddr_t)rup, (caddr_t)uap->rusage, + sizeof (struct rusage)); + } + return(error); +} + +void +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss < ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. 
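+ * The copy-on-write break is visible in dosetrlimit() above: when
+ * p_refcnt is greater than one and PL_SHAREMOD is clear, the caller drops
+ * its reference and substitutes the private copy returned here.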
+ */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit)); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} + +/* + * Find the uidinfo structure for a uid. This structure is used to + * track the total resource consumption (process count, socket buffer + * size, etc.) for the uid and impose limits. + */ +void +uihashinit() +{ + + uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); + mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF); +} + +/* + * lookup a uidinfo struct for the parameter uid. + * uihashtbl_mtx must be locked. + */ +static struct uidinfo * +uilookup(uid) + uid_t uid; +{ + struct uihashhead *uipp; + struct uidinfo *uip; + + mtx_assert(&uihashtbl_mtx, MA_OWNED); + uipp = UIHASH(uid); + LIST_FOREACH(uip, uipp, ui_hash) + if (uip->ui_uid == uid) + break; + + return (uip); +} + +/* + * Find or allocate a struct uidinfo for a particular uid. + * Increase refcount on uidinfo struct returned. + * uifree() should be called on a struct uidinfo when released. + */ +struct uidinfo * +uifind(uid) + uid_t uid; +{ + struct uidinfo *uip; + + mtx_lock(&uihashtbl_mtx); + uip = uilookup(uid); + if (uip == NULL) { + struct uidinfo *old_uip; + + mtx_unlock(&uihashtbl_mtx); + uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); + mtx_lock(&uihashtbl_mtx); + /* + * There's a chance someone created our uidinfo while we + * were in malloc and not holding the lock, so we have to + * make sure we don't insert a duplicate uidinfo + */ + if ((old_uip = uilookup(uid)) != NULL) { + /* someone else beat us to it */ + free(uip, M_UIDINFO); + uip = old_uip; + } else { + uip->ui_mtxp = mtx_pool_alloc(); + uip->ui_uid = uid; + LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); + } + } + uihold(uip); + mtx_unlock(&uihashtbl_mtx); + return (uip); +} + +/* + * Place another refcount on a uidinfo struct. + */ +void +uihold(uip) + struct uidinfo *uip; +{ + + UIDINFO_LOCK(uip); + uip->ui_ref++; + UIDINFO_UNLOCK(uip); +} + +/*- + * Since uidinfo structs have a long lifetime, we use an + * opportunistic refcounting scheme to avoid locking the lookup hash + * for each release. + * + * If the refcount hits 0, we need to free the structure, + * which means we need to lock the hash. + * Optimal case: + * After locking the struct and lowering the refcount, if we find + * that we don't need to free, simply unlock and return. + * Suboptimal case: + * If refcount lowering results in need to free, bump the count + * back up, loose the lock and aquire the locks in the proper + * order to try again. + */ +void +uifree(uip) + struct uidinfo *uip; +{ + + /* Prepare for optimal case. */ + UIDINFO_LOCK(uip); + + if (--uip->ui_ref != 0) { + UIDINFO_UNLOCK(uip); + return; + } + + /* Prepare for suboptimal case. */ + uip->ui_ref++; + UIDINFO_UNLOCK(uip); + mtx_lock(&uihashtbl_mtx); + UIDINFO_LOCK(uip); + + /* + * We must subtract one from the count again because we backed out + * our initial subtraction before dropping the lock. + * Since another thread may have added a reference after we dropped the + * initial lock we have to test for zero again. + */ + if (--uip->ui_ref == 0) { + LIST_REMOVE(uip, ui_hash); + mtx_unlock(&uihashtbl_mtx); + if (uip->ui_sbsize != 0) + /* XXX no %qd in kernel. Truncate. 
*/ + printf("freeing uidinfo: uid = %d, sbsize = %ld\n", + uip->ui_uid, (long)uip->ui_sbsize); + if (uip->ui_proccnt != 0) + printf("freeing uidinfo: uid = %d, proccnt = %ld\n", + uip->ui_uid, uip->ui_proccnt); + UIDINFO_UNLOCK(uip); + FREE(uip, M_UIDINFO); + return; + } + + mtx_unlock(&uihashtbl_mtx); + UIDINFO_UNLOCK(uip); +} + +/* + * Change the count associated with number of processes + * a given user is using. When 'max' is 0, don't enforce a limit + */ +int +chgproccnt(uip, diff, max) + struct uidinfo *uip; + int diff; + int max; +{ + + UIDINFO_LOCK(uip); + /* don't allow them to exceed max, but allow subtraction */ + if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { + UIDINFO_UNLOCK(uip); + return (0); + } + uip->ui_proccnt += diff; + if (uip->ui_proccnt < 0) + printf("negative proccnt for uid = %d\n", uip->ui_uid); + UIDINFO_UNLOCK(uip); + return (1); +} + +/* + * Change the total socket buffer size a user has used. + */ +int +chgsbsize(uip, hiwat, to, max) + struct uidinfo *uip; + u_long *hiwat; + u_long to; + rlim_t max; +{ + rlim_t new; + int s; + + s = splnet(); + UIDINFO_LOCK(uip); + new = uip->ui_sbsize + to - *hiwat; + /* don't allow them to exceed max, but allow subtraction */ + if (to > *hiwat && new > max) { + splx(s); + UIDINFO_UNLOCK(uip); + return (0); + } + uip->ui_sbsize = new; + *hiwat = to; + if (uip->ui_sbsize < 0) + printf("negative sbsize for uid = %d\n", uip->ui_uid); + splx(s); + UIDINFO_UNLOCK(uip); + return (1); +} diff --git a/sys/kern/kern_sema.c b/sys/kern/kern_sema.c new file mode 100644 index 0000000..61435bd --- /dev/null +++ b/sys/kern/kern_sema.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Counting semaphores. + * + * Priority propagation will not generally raise the priority of semaphore + * "owners" (a misnomer in the context of semaphores), so should not be relied + * upon in combination with semaphores. 
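+ *
+ * A minimal usage sketch (hypothetical names; assumes the sema_post()/
+ * sema_wait() wrapper macros from <sys/sema.h> that supply the file/line
+ * arguments to the underscore-prefixed functions below):
+ *
+ *	struct sema items;
+ *
+ *	sema_init(&items, 0, "items");
+ *	sema_post(&items);	producer: one more item is available
+ *	sema_wait(&items);	consumer: sleep until the count is nonzero
+ *	sema_destroy(&items);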
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sema.h> + +void +sema_init(struct sema *sema, int value, const char *description) +{ + + KASSERT((value >= 0), ("%s(): negative value\n", __func__)); + + bzero(sema, sizeof(*sema)); + mtx_init(&sema->sema_mtx, description, "sema backing lock", + MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + cv_init(&sema->sema_cv, description); + sema->sema_value = value; + + CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description); +} + +void +sema_destroy(struct sema *sema) +{ + + CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema, + cv_wmesg(&sema->sema_cv)); + + KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__)); + + mtx_destroy(&sema->sema_mtx); + cv_destroy(&sema->sema_cv); +} + +void +_sema_post(struct sema *sema, const char *file, int line) +{ + + mtx_lock(&sema->sema_mtx); + sema->sema_value++; + if (sema->sema_waiters && sema->sema_value > 0) + cv_signal(&sema->sema_cv); + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + + mtx_unlock(&sema->sema_mtx); +} + +void +_sema_wait(struct sema *sema, const char *file, int line) +{ + + mtx_lock(&sema->sema_mtx); + while (sema->sema_value == 0) { + sema->sema_waiters++; + cv_wait(&sema->sema_cv, &sema->sema_mtx); + sema->sema_waiters--; + } + sema->sema_value--; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + + mtx_unlock(&sema->sema_mtx); +} + +int +_sema_timedwait(struct sema *sema, int timo, const char *file, int line) +{ + int ret, timed_out; + + mtx_lock(&sema->sema_mtx); + + /* + * A spurious wakeup will cause the timeout interval to start over. + * This isn't a big deal as long as spurious wakeups don't occur + * continuously, since the timeout period is merely a lower bound on how + * long to wait. + */ + for (timed_out = 0; sema->sema_value == 0 && timed_out == 0;) { + sema->sema_waiters++; + timed_out = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo); + sema->sema_waiters--; + } + if (sema->sema_value > 0) { + /* Success. */ + sema->sema_value--; + ret = 1; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + } else { + ret = 0; + + CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), file, line); + } + + mtx_unlock(&sema->sema_mtx); + return (ret); +} + +int +_sema_trywait(struct sema *sema, const char *file, int line) +{ + int ret; + + mtx_lock(&sema->sema_mtx); + + if (sema->sema_value > 0) { + /* Success. */ + sema->sema_value--; + ret = 1; + + CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); + } else { + ret = 0; + + CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, + cv_wmesg(&sema->sema_cv), file, line); + } + + mtx_unlock(&sema->sema_mtx); + return (ret); +} + +int +sema_value(struct sema *sema) +{ + int ret; + + mtx_lock(&sema->sema_mtx); + ret = sema->sema_value; + mtx_unlock(&sema->sema_mtx); + return (ret); +} diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..d2cb69d --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,564 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ddb.h" +#include "opt_hw_wdog.h" +#include "opt_panic.h" +#include "opt_show_busybufs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/disklabel.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> /* smp_active */ +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/vnode.h> + +#include <machine/pcb.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <sys/signalvar.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. 
+ */ +#include <machine/stdarg.h> + +#ifdef DDB +#ifdef DDB_UNATTENDED +int debugger_on_panic = 0; +#else +int debugger_on_panic = 1; +#endif +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, "Run debugger on kernel panic"); +#endif + +int sync_on_panic = 1; +SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW, + &sync_on_panic, 0, "Do a sync before rebooting from a panic"); + +SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); + +#ifdef HW_WDOG +/* + * If there is a hardware watchdog, point this at the function needed to + * hold it off. + * It's needed when the kernel needs to do some lengthy operations. + * e.g. in wd.c when dumping core.. It's most annoying to have + * your precious core-dump only half written because the wdog kicked in. + */ +watchdog_tickle_fn wdog_tickler = NULL; +#endif /* HW_WDOG */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +int dumping; /* system is dumping */ +static struct dumperinfo dumper; /* our selected dumper */ +static struct pcb dumppcb; /* "You Are Here" sign for dump-debuggers */ + +static void boot(int) __dead2; +static void poweroff_wait(void *, int); +static void shutdown_halt(void *junk, int howto); +static void shutdown_panic(void *junk, int howto); +static void shutdown_reset(void *junk, int howto); + +/* register various local shutdown events */ +static void +shutdown_conf(void *unused) +{ + EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); + EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); +} + +SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL) + +/* + * The system call that results in a reboot + * + * MPSAFE + */ +/* ARGSUSED */ +int +reboot(struct thread *td, struct reboot_args *uap) +{ + int error; + + mtx_lock(&Giant); + if ((error = suser(td)) == 0) + boot(uap->opt); + mtx_unlock(&Giant); + return (error); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +static int shutdown_howto = 0; + +void +shutdown_nice(int howto) +{ + shutdown_howto = howto; + + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + PROC_LOCK(initproc); + psignal(initproc, SIGINT); + PROC_UNLOCK(initproc); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; + +static void +print_uptime(void) +{ + int f; + struct timespec ts; + + getnanouptime(&ts); + printf("Uptime: "); + f = 0; + if (ts.tv_sec >= 86400) { + printf("%ldd", (long)ts.tv_sec / 86400); + ts.tv_sec %= 86400; + f = 1; + } + if (f || ts.tv_sec >= 3600) { + printf("%ldh", (long)ts.tv_sec / 3600); + ts.tv_sec %= 3600; + f = 1; + } + if (f || ts.tv_sec >= 60) { + printf("%ldm", (long)ts.tv_sec / 60); + ts.tv_sec %= 60; + f = 1; + } + printf("%lds\n", (long)ts.tv_sec); +} + +static void +doadump(void) +{ + savectx(&dumppcb); + dumping++; + dumpsys(&dumper); +} + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. 
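+ * The sequence below is: run the shutdown_pre_sync handlers, sync and
+ * wait for dirty buffers unless RB_NOSYNC is set, unmount filesystems if
+ * everything flushed and we are not panicking, run the shutdown_post_sync
+ * handlers, take a crash dump if RB_DUMP (without RB_HALT) was requested
+ * and a dumper is configured, and finally run the shutdown_final handlers
+ * (halt, panic report or reset).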
+ */ +static void +boot(int howto) +{ + + /* collect extra flags that shutdown_nice might have set */ + howto |= shutdown_howto; + +#ifdef DDB + /* We are out of the debugger now. */ + db_active = 0; +#endif + +#ifdef SMP + if (smp_active) + printf("boot() called on cpu#%d\n", PCPU_GET(cpuid)); +#endif + /* + * Do any callouts that should be done BEFORE syncing the filesystems. + */ + EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); + + /* + * Now sync filesystems + */ + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy, pbusy; + int subiter; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&thread0, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = pbusy = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & B_INVAL) == 0 && + BUF_REFCNT(bp) > 0) { + nbusy++; + } else if ((bp->b_flags & (B_DELWRI | B_INVAL)) + == B_DELWRI) { + /* bawrite(bp);*/ + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + if (nbusy < pbusy) + iter = 0; + pbusy = nbusy; + sync(&thread0, NULL); + if (curthread != NULL) { + DROP_GIANT(); + for (subiter = 0; subiter < 50 * iter; subiter++) { + mtx_lock_spin(&sched_lock); + setrunqueue(curthread); + curthread->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); /* Allow interrupt threads to run */ + mtx_unlock_spin(&sched_lock); + DELAY(1000); + } + PICKUP_GIANT(); + } else + DELAY(50000 * iter); + } + printf("\n"); + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (((bp->b_flags&B_INVAL) == 0 && BUF_REFCNT(bp)) || + ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI)) { + if (bp->b_dev == NODEV) { + TAILQ_REMOVE(&mountlist, + bp->b_vp->v_mount, mnt_list); + continue; + } + nbusy++; +#if defined(SHOW_BUSYBUFS) || defined(DIAGNOSTIC) + printf( + "%d: dev:%s, flags:%08lx, blkno:%ld, lblkno:%ld\n", + nbusy, devtoname(bp->b_dev), + bp->b_flags, (long)bp->b_blkno, + (long)bp->b_lblkno); +#endif + } + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("giving up on %d buffers\n", nbusy); + DELAY(5000000); /* 5 seconds */ + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + + print_uptime(); + + /* + * Ok, now do things that assume all filesystem activity has + * been completed. + */ + EVENTHANDLER_INVOKE(shutdown_post_sync, howto); + splhigh(); + if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && + !cold && dumper.dumper != NULL && !dumping) + doadump(); + + /* Now that we're going to really halt the system... */ + EVENTHANDLER_INVOKE(shutdown_final, howto); + + for(;;) ; /* safety against shutdown_reset not working */ + /* NOTREACHED */ +} + +/* + * If the shutdown was a clean halt, behave accordingly. 
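+ * That is: announce the halt, wait for a key press and then return so
+ * that the reset handler registered after this one can reboot the
+ * machine; if there is no console to read from, just call cpu_halt().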
+ */ +static void +shutdown_halt(void *junk, int howto) +{ + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + howto &= ~RB_HALT; + break; + } + } +} + +/* + * Check to see if the system paniced, pause and then reboot + * according to the specified delay. + */ +static void +shutdown_panic(void *junk, int howto) +{ + int loop; + + if (howto & RB_DUMP) { + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + printf("Automatic reboot in %d seconds - " + "press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; + loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? */ + if (cncheckc() != -1) + break; + } + if (!loop) + return; + } + } else { /* zero time specified - reboot NOW */ + return; + } + printf("--> Press a key on the console to reboot,\n"); + printf("--> or switch off the system now.\n"); + cngetc(); + } +} + +/* + * Everything done, now reset + */ +static void +shutdown_reset(void *junk, int howto) +{ + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + /* NOTREACHED */ /* assuming reset worked */ +} + +#ifdef SMP +static u_int panic_cpu = NOCPU; +#endif + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + * + * MPSAFE + */ +void +panic(const char *fmt, ...) +{ + int bootopt; + va_list ap; + static char buf[256]; + +#ifdef SMP + /* + * We don't want multiple CPU's to panic at the same time, so we + * use panic_cpu as a simple spinlock. We have to keep checking + * panic_cpu if we are spinning in case the panic on the first + * CPU is canceled. + */ + if (panic_cpu != PCPU_GET(cpuid)) + while (atomic_cmpset_int(&panic_cpu, NOCPU, + PCPU_GET(cpuid)) == 0) + while (panic_cpu != NOCPU) + ; /* nothing */ +#endif + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + va_start(ap, fmt); + (void)vsnprintf(buf, sizeof(buf), fmt, ap); + if (panicstr == fmt) + panicstr = buf; + va_end(ap); + printf("panic: %s\n", buf); +#ifdef SMP + /* two separate prints in case of an unmapped page and trap */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); +#ifdef APIC_IO + printf("lapic.id = %08x\n", lapic.id); +#endif +#endif + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#ifdef RESTARTABLE_PANICS + /* See if the user aborted the panic, in which case we continue. */ + if (panicstr == NULL) { +#ifdef SMP + atomic_store_rel_int(&panic_cpu, NOCPU); +#endif + return; + } +#endif +#endif + if (!sync_on_panic) + bootopt |= RB_NOSYNC; + boot(bootopt); +} + +/* + * Support for poweroff delay. + */ +#ifndef POWEROFF_DELAY +# define POWEROFF_DELAY 5000 +#endif +static int poweroff_delay = POWEROFF_DELAY; + +SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, + &poweroff_delay, 0, ""); + +static void +poweroff_wait(void *junk, int howto) +{ + if(!(howto & RB_POWEROFF) || poweroff_delay <= 0) + return; + DELAY(poweroff_delay * 1000); +} + +/* + * Some system processes (e.g. 
syncer) need to be stopped at appropriate + * points in their main loops prior to a system shutdown, so that they + * won't interfere with the shutdown process (e.g. by holding a disk buf + * to cause sync to fail). For each of these system processes, register + * shutdown_kproc() as a handler for one of shutdown events. + */ +static int kproc_shutdown_wait = 60; +SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, + &kproc_shutdown_wait, 0, ""); + +void +kproc_shutdown(void *arg, int howto) +{ + struct proc *p; + int error; + + if (panicstr) + return; + + p = (struct proc *)arg; + printf("Waiting (max %d seconds) for system process `%s' to stop...", + kproc_shutdown_wait, p->p_comm); + error = kthread_suspend(p, kproc_shutdown_wait * hz); + + if (error == EWOULDBLOCK) + printf("timed out\n"); + else + printf("stopped\n"); +} + +/* Registration of dumpers */ +int +set_dumper(struct dumperinfo *di) +{ + if (di == NULL) { + bzero(&dumper, sizeof dumper); + return (0); + } + if (dumper.dumper != NULL) + return (EBUSY); + dumper = *di; + return (0); +} + +#if defined(__powerpc__) || defined(__sparc64__) +void +dumpsys(struct dumperinfo *di __unused) +{ + + printf("Kernel dumps not implemented on this architecture\n"); +} +#endif diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 0000000..8af0280 --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,2153 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/signalvar.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/event.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/acct.h> +#include <sys/fcntl.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/wait.h> +#include <sys/ktr.h> +#include <sys/ktrace.h> +#include <sys/resourcevar.h> +#include <sys/smp.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/syslog.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/unistd.h> + +#include <machine/cpu.h> + +#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ + +static int coredump(struct thread *); +static int do_sigaction(struct proc *p, int sig, struct sigaction *act, + struct sigaction *oact, int old); +static int do_sigprocmask(struct proc *p, int how, sigset_t *set, + sigset_t *oset, int old); +static char *expand_name(const char *, uid_t, pid_t); +static int killpg1(struct thread *td, int sig, int pgid, int all); +static int sig_ffs(sigset_t *set); +static int sigprop(int sig); +static void stop(struct proc *); + +static int filt_sigattach(struct knote *kn); +static void filt_sigdetach(struct knote *kn); +static int filt_signal(struct knote *kn, long hint); + +struct filterops sig_filtops = + { 0, filt_sigattach, filt_sigdetach, filt_signal }; + +static int kern_logsigexit = 1; +SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, + &kern_logsigexit, 0, + "Log processes quitting on abnormal signals to syslog(3)"); + +/* + * Policy -- Can ucred cr1 send SIGIO to process cr2? + * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG + * in the right situations. + */ +#define CANSIGIO(cr1, cr2) \ + ((cr1)->cr_uid == 0 || \ + (cr1)->cr_ruid == (cr2)->cr_ruid || \ + (cr1)->cr_uid == (cr2)->cr_ruid || \ + (cr1)->cr_ruid == (cr2)->cr_uid || \ + (cr1)->cr_uid == (cr2)->cr_uid) + +int sugid_coredump; +SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, + &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); + +static int do_coredump = 1; +SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, + &do_coredump, 0, "Enable/Disable coredumps"); + +/* + * Signal properties and actions. 
+ * The array below categorizes the signals and their default actions + * according to the following properties: + */ +#define SA_KILL 0x01 /* terminates process by default */ +#define SA_CORE 0x02 /* ditto and coredumps */ +#define SA_STOP 0x04 /* suspend process */ +#define SA_TTYSTOP 0x08 /* ditto, from tty */ +#define SA_IGNORE 0x10 /* ignore by default */ +#define SA_CONT 0x20 /* continue if suspended */ +#define SA_CANTMASK 0x40 /* non-maskable, catchable */ + +static int sigproptbl[NSIG] = { + SA_KILL, /* SIGHUP */ + SA_KILL, /* SIGINT */ + SA_KILL|SA_CORE, /* SIGQUIT */ + SA_KILL|SA_CORE, /* SIGILL */ + SA_KILL|SA_CORE, /* SIGTRAP */ + SA_KILL|SA_CORE, /* SIGABRT */ + SA_KILL|SA_CORE, /* SIGEMT */ + SA_KILL|SA_CORE, /* SIGFPE */ + SA_KILL, /* SIGKILL */ + SA_KILL|SA_CORE, /* SIGBUS */ + SA_KILL|SA_CORE, /* SIGSEGV */ + SA_KILL|SA_CORE, /* SIGSYS */ + SA_KILL, /* SIGPIPE */ + SA_KILL, /* SIGALRM */ + SA_KILL, /* SIGTERM */ + SA_IGNORE, /* SIGURG */ + SA_STOP, /* SIGSTOP */ + SA_STOP|SA_TTYSTOP, /* SIGTSTP */ + SA_IGNORE|SA_CONT, /* SIGCONT */ + SA_IGNORE, /* SIGCHLD */ + SA_STOP|SA_TTYSTOP, /* SIGTTIN */ + SA_STOP|SA_TTYSTOP, /* SIGTTOU */ + SA_IGNORE, /* SIGIO */ + SA_KILL, /* SIGXCPU */ + SA_KILL, /* SIGXFSZ */ + SA_KILL, /* SIGVTALRM */ + SA_KILL, /* SIGPROF */ + SA_IGNORE, /* SIGWINCH */ + SA_IGNORE, /* SIGINFO */ + SA_KILL, /* SIGUSR1 */ + SA_KILL, /* SIGUSR2 */ +}; + +/* + * Determine signal that should be delivered to process p, the current + * process, 0 if none. If there is a pending stop signal with default + * action, the process stops in issignal(). + * + * MP SAFE. + */ +int +cursig(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&sched_lock, MA_NOTOWNED); + return (SIGPENDING(p) ? issignal(p) : 0); +} + +/* + * Arrange for ast() to handle unmasked pending signals on return to user + * mode. This must be called whenever a signal is added to p_siglist or + * unmasked in p_sigmask. 
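+ * It does so by setting PS_NEEDSIGCHK on the process and KEF_ASTPENDING
+ * on the KSE, under sched_lock, whenever a signal is actually pending.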
+ */ +void +signotify(struct proc *p) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_lock_spin(&sched_lock); + if (SIGPENDING(p)) { + p->p_sflag |= PS_NEEDSIGCHK; + p->p_kse.ke_flags |= KEF_ASTPENDING; /* XXXKSE */ + } + mtx_unlock_spin(&sched_lock); +} + +static __inline int +sigprop(int sig) +{ + + if (sig > 0 && sig < NSIG) + return (sigproptbl[_SIG_IDX(sig)]); + return (0); +} + +static __inline int +sig_ffs(sigset_t *set) +{ + int i; + + for (i = 0; i < _SIG_WORDS; i++) + if (set->__bits[i]) + return (ffs(set->__bits[i]) + (i * 32)); + return (0); +} + +/* + * do_sigaction + * sigaction + * osigaction + */ +static int +do_sigaction(p, sig, act, oact, old) + struct proc *p; + register int sig; + struct sigaction *act, *oact; + int old; +{ + register struct sigacts *ps; + + if (!_SIG_VALID(sig)) + return (EINVAL); + + PROC_LOCK(p); + ps = p->p_sigacts; + if (oact) { + oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; + oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; + oact->sa_flags = 0; + if (SIGISMEMBER(ps->ps_sigonstack, sig)) + oact->sa_flags |= SA_ONSTACK; + if (!SIGISMEMBER(ps->ps_sigintr, sig)) + oact->sa_flags |= SA_RESTART; + if (SIGISMEMBER(ps->ps_sigreset, sig)) + oact->sa_flags |= SA_RESETHAND; + if (SIGISMEMBER(ps->ps_signodefer, sig)) + oact->sa_flags |= SA_NODEFER; + if (SIGISMEMBER(ps->ps_siginfo, sig)) + oact->sa_flags |= SA_SIGINFO; + if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDSTOP) + oact->sa_flags |= SA_NOCLDSTOP; + if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDWAIT) + oact->sa_flags |= SA_NOCLDWAIT; + } + if (act) { + if ((sig == SIGKILL || sig == SIGSTOP) && + act->sa_handler != SIG_DFL) { + PROC_UNLOCK(p); + return (EINVAL); + } + + /* + * Change setting atomically. + */ + + ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; + SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); + if (act->sa_flags & SA_SIGINFO) { + ps->ps_sigact[_SIG_IDX(sig)] = + (__sighandler_t *)act->sa_sigaction; + SIGADDSET(ps->ps_siginfo, sig); + } else { + ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; + SIGDELSET(ps->ps_siginfo, sig); + } + if (!(act->sa_flags & SA_RESTART)) + SIGADDSET(ps->ps_sigintr, sig); + else + SIGDELSET(ps->ps_sigintr, sig); + if (act->sa_flags & SA_ONSTACK) + SIGADDSET(ps->ps_sigonstack, sig); + else + SIGDELSET(ps->ps_sigonstack, sig); + if (act->sa_flags & SA_RESETHAND) + SIGADDSET(ps->ps_sigreset, sig); + else + SIGDELSET(ps->ps_sigreset, sig); + if (act->sa_flags & SA_NODEFER) + SIGADDSET(ps->ps_signodefer, sig); + else + SIGDELSET(ps->ps_signodefer, sig); +#ifdef COMPAT_SUNOS + if (act->sa_flags & SA_USERTRAMP) + SIGADDSET(ps->ps_usertramp, sig); + else + SIGDELSET(ps->ps_usertramp, sig); +#endif + if (sig == SIGCHLD) { + if (act->sa_flags & SA_NOCLDSTOP) + p->p_procsig->ps_flag |= PS_NOCLDSTOP; + else + p->p_procsig->ps_flag &= ~PS_NOCLDSTOP; + if (act->sa_flags & SA_NOCLDWAIT) { + /* + * Paranoia: since SA_NOCLDWAIT is implemented + * by reparenting the dying child to PID 1 (and + * trust it to reap the zombie), PID 1 itself + * is forbidden to set SA_NOCLDWAIT. + */ + if (p->p_pid == 1) + p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; + else + p->p_procsig->ps_flag |= PS_NOCLDWAIT; + } else + p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; + if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) + p->p_procsig->ps_flag |= PS_CLDSIGIGN; + else + p->p_procsig->ps_flag &= ~PS_CLDSIGIGN; + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to + * ignore. 
However, don't put SIGCONT in p_sigignore, as we + * have to restart the process. + */ + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || + (sigprop(sig) & SA_IGNORE && + ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { + /* never to be seen again */ + SIGDELSET(p->p_siglist, sig); + if (sig != SIGCONT) + /* easier in psignal */ + SIGADDSET(p->p_sigignore, sig); + SIGDELSET(p->p_sigcatch, sig); + } else { + SIGDELSET(p->p_sigignore, sig); + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) + SIGDELSET(p->p_sigcatch, sig); + else + SIGADDSET(p->p_sigcatch, sig); + } +#ifdef COMPAT_43 + if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || + ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || !old) + SIGDELSET(ps->ps_osigset, sig); + else + SIGADDSET(ps->ps_osigset, sig); +#endif + } + PROC_UNLOCK(p); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int sig; + struct sigaction *act; + struct sigaction *oact; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigaction(td, uap) + struct thread *td; + register struct sigaction_args *uap; +{ + struct proc *p = td->td_proc; + struct sigaction act, oact; + register struct sigaction *actp, *oactp; + int error; + + mtx_lock(&Giant); + + actp = (uap->act != NULL) ? &act : NULL; + oactp = (uap->oact != NULL) ? &oact : NULL; + if (actp) { + error = copyin(uap->act, actp, sizeof(act)); + if (error) + goto done2; + } + error = do_sigaction(p, uap->sig, actp, oactp, 0); + if (oactp && !error) { + error = copyout(oactp, uap->oact, sizeof(oact)); + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigaction_args { + int signum; + struct osigaction *nsa; + struct osigaction *osa; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigaction(td, uap) + struct thread *td; + register struct osigaction_args *uap; +{ + struct proc *p = td->td_proc; + struct osigaction sa; + struct sigaction nsa, osa; + register struct sigaction *nsap, *osap; + int error; + + if (uap->signum <= 0 || uap->signum >= ONSIG) + return (EINVAL); + + nsap = (uap->nsa != NULL) ? &nsa : NULL; + osap = (uap->osa != NULL) ? &osa : NULL; + + mtx_lock(&Giant); + + if (nsap) { + error = copyin(uap->nsa, &sa, sizeof(sa)); + if (error) + goto done2; + nsap->sa_handler = sa.sa_handler; + nsap->sa_flags = sa.sa_flags; + OSIG2SIG(sa.sa_mask, nsap->sa_mask); + } + error = do_sigaction(p, uap->signum, nsap, osap, 1); + if (osap && !error) { + sa.sa_handler = osap->sa_handler; + sa.sa_flags = osap->sa_flags; + SIG2OSIG(osap->sa_mask, sa.sa_mask); + error = copyout(&sa, uap->osa, sizeof(sa)); + } +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + PROC_LOCK(p); + for (i = 1; i <= NSIG; i++) + if (sigprop(i) & SA_IGNORE && i != SIGCONT) + SIGADDSET(p->p_sigignore, i); + PROC_UNLOCK(p); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps; + register int sig; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). 
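+	 * Caught signals are reset to SIG_DFL; those whose default action
+	 * is to ignore (other than SIGCONT) are added to p_sigignore and
+	 * removed from p_siglist.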
+ */ + PROC_LOCK_ASSERT(p, MA_OWNED); + ps = p->p_sigacts; + while (SIGNOTEMPTY(p->p_sigcatch)) { + sig = sig_ffs(&p->p_sigcatch); + SIGDELSET(p->p_sigcatch, sig); + if (sigprop(sig) & SA_IGNORE) { + if (sig != SIGCONT) + SIGADDSET(p->p_sigignore, sig); + SIGDELSET(p->p_siglist, sig); + } + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + p->p_sigstk.ss_flags = SS_DISABLE; + p->p_sigstk.ss_size = 0; + p->p_sigstk.ss_sp = 0; + p->p_flag &= ~P_ALTSTACK; + /* + * Reset no zombies if child dies flag as Solaris does. + */ + p->p_procsig->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); + if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) + ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; +} + +/* + * do_sigprocmask() + * + * Manipulate signal mask. + */ +static int +do_sigprocmask(p, how, set, oset, old) + struct proc *p; + int how; + sigset_t *set, *oset; + int old; +{ + int error; + + PROC_LOCK(p); + if (oset != NULL) + *oset = p->p_sigmask; + + error = 0; + if (set != NULL) { + switch (how) { + case SIG_BLOCK: + SIG_CANTMASK(*set); + SIGSETOR(p->p_sigmask, *set); + break; + case SIG_UNBLOCK: + SIGSETNAND(p->p_sigmask, *set); + signotify(p); + break; + case SIG_SETMASK: + SIG_CANTMASK(*set); + if (old) + SIGSETLO(p->p_sigmask, *set); + else + p->p_sigmask = *set; + signotify(p); + break; + default: + error = EINVAL; + break; + } + } + PROC_UNLOCK(p); + return (error); +} + +/* + * sigprocmask() - MP SAFE (XXXKSE not under KSE it isn't) + */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + const sigset_t *set; + sigset_t *oset; +}; +#endif +int +sigprocmask(td, uap) + register struct thread *td; + struct sigprocmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set, oset; + sigset_t *setp, *osetp; + int error; + + setp = (uap->set != NULL) ? &set : NULL; + osetp = (uap->oset != NULL) ? 
&oset : NULL; + if (setp) { + error = copyin(uap->set, setp, sizeof(set)); + if (error) + return (error); + } + error = do_sigprocmask(p, uap->how, setp, osetp, 0); + if (osetp && !error) { + error = copyout(osetp, uap->oset, sizeof(oset)); + } + return (error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +/* + * osigprocmask() - MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct osigprocmask_args { + int how; + osigset_t mask; +}; +#endif +int +osigprocmask(td, uap) + register struct thread *td; + struct osigprocmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set, oset; + int error; + + OSIG2SIG(uap->mask, set); + error = do_sigprocmask(p, uap->how, &set, &oset, 1); + SIG2OSIG(oset, td->td_retval[0]); + return (error); +} +#endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + sigset_t *set; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigpending(td, uap) + struct thread *td; + struct sigpending_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t siglist; + int error; + + mtx_lock(&Giant); + PROC_LOCK(p); + siglist = p->p_siglist; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + error = copyout(&siglist, uap->set, sizeof(sigset_t)); + return(error); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigpending_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigpending(td, uap) + struct thread *td; + struct osigpending_args *uap; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_siglist, td->td_retval[0]); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. + */ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigvec(td, uap) + struct thread *td; + register struct osigvec_args *uap; +{ + struct proc *p = td->td_proc; + struct sigvec vec; + struct sigaction nsa, osa; + register struct sigaction *nsap, *osap; + int error; + + if (uap->signum <= 0 || uap->signum >= ONSIG) + return (EINVAL); + nsap = (uap->nsv != NULL) ? &nsa : NULL; + osap = (uap->osv != NULL) ? 
&osa : NULL; + if (nsap) { + error = copyin(uap->nsv, &vec, sizeof(vec)); + if (error) + return (error); + nsap->sa_handler = vec.sv_handler; + OSIG2SIG(vec.sv_mask, nsap->sa_mask); + nsap->sa_flags = vec.sv_flags; + nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ +#ifdef COMPAT_SUNOS + nsap->sa_flags |= SA_USERTRAMP; +#endif + } + mtx_lock(&Giant); + error = do_sigaction(p, uap->signum, nsap, osap, 1); + mtx_unlock(&Giant); + if (osap && !error) { + vec.sv_handler = osap->sa_handler; + SIG2OSIG(osap->sa_mask, vec.sv_mask); + vec.sv_flags = osap->sa_flags; + vec.sv_flags &= ~SA_NOCLDWAIT; + vec.sv_flags ^= SA_RESTART; +#ifdef COMPAT_SUNOS + vec.sv_flags &= ~SA_NOCLDSTOP; +#endif + error = copyout(&vec, uap->osv, sizeof(vec)); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif +/* + * MPSAFE + */ +int +osigblock(td, uap) + register struct thread *td; + struct osigblock_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set; + + OSIG2SIG(uap->mask, set); + SIG_CANTMASK(set); + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_sigmask, td->td_retval[0]); + SIGSETOR(p->p_sigmask, set); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif +/* + * MPSAFE + */ +int +osigsetmask(td, uap) + struct thread *td; + struct osigsetmask_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t set; + + OSIG2SIG(uap->mask, set); + SIG_CANTMASK(set); + mtx_lock(&Giant); + PROC_LOCK(p); + SIG2OSIG(p->p_sigmask, td->td_retval[0]); + SIGSETLO(p->p_sigmask, set); + signotify(p); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + ***** XXXKSE this doesn't make sense under KSE. + ***** Do we suspend the thread or all threads in the process? + ***** How do we suspend threads running NOW on another processor? + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + const sigset_t *sigmask; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigsuspend(td, uap) + struct thread *td; + struct sigsuspend_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t mask; + register struct sigacts *ps; + int error; + + error = copyin(uap->sigmask, &mask, sizeof(mask)); + if (error) + return (error); + + /* + * When returning from sigsuspend, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ + mtx_lock(&Giant); + PROC_LOCK(p); + ps = p->p_sigacts; + p->p_oldsigmask = p->p_sigmask; + p->p_flag |= P_OLDMASK; + + SIG_CANTMASK(mask); + p->p_sigmask = mask; + signotify(p); + while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + /* always return EINTR rather than ERESTART... 
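+	 * POSIX requires sigsuspend() to fail with EINTR once a handler
+	 * has run; returning ERESTART would restart the call and simply
+	 * block again on the temporary mask.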
*/ + return (EINTR); +} + +#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ +#ifndef _SYS_SYSPROTO_H_ +struct osigsuspend_args { + osigset_t mask; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigsuspend(td, uap) + struct thread *td; + struct osigsuspend_args *uap; +{ + struct proc *p = td->td_proc; + sigset_t mask; + register struct sigacts *ps; + + mtx_lock(&Giant); + PROC_LOCK(p); + ps = p->p_sigacts; + p->p_oldsigmask = p->p_sigmask; + p->p_flag |= P_OLDMASK; + OSIG2SIG(uap->mask, mask); + SIG_CANTMASK(mask); + SIGSETLO(p->p_sigmask, mask); + signotify(p); + while (msleep((caddr_t) ps, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) + /* void */; + PROC_UNLOCK(p); + mtx_unlock(&Giant); + /* always return EINTR rather than ERESTART... */ + return (EINTR); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osigstack(td, uap) + struct thread *td; + register struct osigstack_args *uap; +{ + struct proc *p = td->td_proc; + struct sigstack ss; + int error = 0; + + mtx_lock(&Giant); + + if (uap->oss != NULL) { + PROC_LOCK(p); + ss.ss_sp = p->p_sigstk.ss_sp; + ss.ss_onstack = sigonstack(cpu_getstack(td)); + PROC_UNLOCK(p); + error = copyout(&ss, uap->oss, sizeof(struct sigstack)); + if (error) + goto done2; + } + + if (uap->nss != NULL) { + if ((error = copyin(uap->nss, &ss, sizeof(ss))) != 0) + goto done2; + PROC_LOCK(p); + p->p_sigstk.ss_sp = ss.ss_sp; + p->p_sigstk.ss_size = 0; + p->p_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; + p->p_flag |= P_ALTSTACK; + PROC_UNLOCK(p); + } +done2: + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + stack_t *ss; + stack_t *oss; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +sigaltstack(td, uap) + struct thread *td; + register struct sigaltstack_args *uap; +{ + struct proc *p = td->td_proc; + stack_t ss; + int oonstack; + int error = 0; + + mtx_lock(&Giant); + + oonstack = sigonstack(cpu_getstack(td)); + + if (uap->oss != NULL) { + PROC_LOCK(p); + ss = p->p_sigstk; + ss.ss_flags = (p->p_flag & P_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + PROC_UNLOCK(p); + if ((error = copyout(&ss, uap->oss, sizeof(stack_t))) != 0) + goto done2; + } + + if (uap->ss != NULL) { + if (oonstack) { + error = EPERM; + goto done2; + } + if ((error = copyin(uap->ss, &ss, sizeof(ss))) != 0) + goto done2; + if ((ss.ss_flags & ~SS_DISABLE) != 0) { + error = EINVAL; + goto done2; + } + if (!(ss.ss_flags & SS_DISABLE)) { + if (ss.ss_size < p->p_sysent->sv_minsigstksz) { + error = ENOMEM; + goto done2; + } + PROC_LOCK(p); + p->p_sigstk = ss; + p->p_flag |= P_ALTSTACK; + PROC_UNLOCK(p); + } else { + PROC_LOCK(p); + p->p_flag &= ~P_ALTSTACK; + PROC_UNLOCK(p); + } + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. 
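+ * The pid encodings handled here and in kill() below correspond roughly
+ * to the userland view:
+ *	kill(pid, sig)		pid > 0: signal that single process
+ *	kill(0, sig)		signal the caller's own process group
+ *	kill(-pgid, sig)	signal process group pgid
+ *	kill(-1, sig)		broadcast to every process we may signal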
+ */ +int +killpg1(td, sig, pgid, all) + register struct thread *td; + int sig, pgid, all; +{ + register struct proc *p; + struct pgrp *pgrp; + int nfound = 0; + + if (all) { + /* + * broadcast + */ + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == td->td_proc) { + PROC_UNLOCK(p); + continue; + } + if (p_cansignal(td, p, sig) == 0) { + nfound++; + if (sig) + psignal(p, sig); + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + } else { + sx_slock(&proctree_lock); + if (pgid == 0) { + /* + * zero pgid means send to my process group. + */ + pgrp = td->td_proc->p_pgrp; + PGRP_LOCK(pgrp); + } else { + pgrp = pgfind(pgid); + if (pgrp == NULL) { + sx_sunlock(&proctree_lock); + return (ESRCH); + } + } + sx_sunlock(&proctree_lock); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) { + PROC_UNLOCK(p); + continue; + } + if (p->p_stat == SZOMB) { + PROC_UNLOCK(p); + continue; + } + if (p_cansignal(td, p, sig) == 0) { + nfound++; + if (sig) + psignal(p, sig); + } + PROC_UNLOCK(p); + } + PGRP_UNLOCK(pgrp); + } + return (nfound ? 0 : ESRCH); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +kill(td, uap) + register struct thread *td; + register struct kill_args *uap; +{ + register struct proc *p; + int error = 0; + + if ((u_int)uap->signum > _SIG_MAXSIG) + return (EINVAL); + + mtx_lock(&Giant); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) { + error = ESRCH; + } else if ((error = p_cansignal(td, p, uap->signum)) != 0) { + PROC_UNLOCK(p); + } else { + if (uap->signum) + psignal(p, uap->signum); + PROC_UNLOCK(p); + error = 0; + } + } else { + switch (uap->pid) { + case -1: /* broadcast signal */ + error = killpg1(td, uap->signum, 0, 1); + break; + case 0: /* signal own process group */ + error = killpg1(td, uap->signum, 0, 0); + break; + default: /* negative explicit process group */ + error = killpg1(td, uap->signum, -uap->pid, 0); + break; + } + } + mtx_unlock(&Giant); + return(error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +okillpg(td, uap) + struct thread *td; + register struct okillpg_args *uap; +{ + int error; + + if ((u_int)uap->signum > _SIG_MAXSIG) + return (EINVAL); + mtx_lock(&Giant); + error = killpg1(td, uap->signum, uap->pgid, 0); + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Send a signal to a process group. + */ +void +gsignal(pgid, sig) + int pgid, sig; +{ + struct pgrp *pgrp; + + if (pgid != 0) { + sx_slock(&proctree_lock); + pgrp = pgfind(pgid); + sx_sunlock(&proctree_lock); + if (pgrp != NULL) { + pgsignal(pgrp, sig, 0); + PGRP_UNLOCK(pgrp); + } + } +} + +/* + * Send a signal to a process group. If checktty is 1, + * limit to members which have a controlling terminal. + */ +void +pgsignal(pgrp, sig, checkctty) + struct pgrp *pgrp; + int sig, checkctty; +{ + register struct proc *p; + + if (pgrp) { + PGRP_LOCK_ASSERT(pgrp, MA_OWNED); + LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (checkctty == 0 || p->p_flag & P_CONTROLT) + psignal(p, sig); + PROC_UNLOCK(p); + } + } +} + +/* + * Send a signal caused by a trap to the current process. + * If it will be caught immediately, deliver it with correct code. 
+ * Otherwise, post it normally. + * + * MPSAFE + */ +void +trapsignal(p, sig, code) + struct proc *p; + register int sig; + u_long code; +{ + register struct sigacts *ps = p->p_sigacts; + + PROC_LOCK(p); + if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(p->p_sigcatch, sig) && + !SIGISMEMBER(p->p_sigmask, sig)) { + p->p_stats->p_ru.ru_nsignals++; +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_PSIG)) + ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], + &p->p_sigmask, code); +#endif + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig, + &p->p_sigmask, code); + SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); + if (!SIGISMEMBER(ps->ps_signodefer, sig)) + SIGADDSET(p->p_sigmask, sig); + if (SIGISMEMBER(ps->ps_sigreset, sig)) { + /* + * See do_sigaction() for origin of this code. + */ + SIGDELSET(p->p_sigcatch, sig); + if (sig != SIGCONT && + sigprop(sig) & SA_IGNORE) + SIGADDSET(p->p_sigignore, sig); + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + } else { + p->p_code = code; /* XXX for core dump/debugger */ + p->p_sig = sig; /* XXX to verify code */ + psignal(p, sig); + } + PROC_UNLOCK(p); +} + +/* + * Send the signal to the process. If the signal has an action, the action + * is usually performed by the target process rather than the caller; we add + * the signal to the set of pending signals for the process. + * + * Exceptions: + * o When a stop signal is sent to a sleeping process that takes the + * default action, the process is stopped without awakening it. + * o SIGCONT restarts stopped processes (or puts them back to sleep) + * regardless of the signal action (eg, blocked or ignored). + * + * Other ignored signals are discarded immediately. + */ +void +psignal(p, sig) + register struct proc *p; + register int sig; +{ + register int prop; + register sig_t action; + struct thread *td; +#ifdef SMP + struct ksegrp *kg; +#endif + + KASSERT(_SIG_VALID(sig), + ("psignal(): invalid signal %d\n", sig)); + + PROC_LOCK_ASSERT(p, MA_OWNED); + KNOTE(&p->p_klist, NOTE_SIGNAL | sig); + + prop = sigprop(sig); + + /* + * If proc is traced, always give parent a chance; + * if signal event is tracked by procfs, give *that* + * a chance, as well. + */ + if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) { + action = SIG_DFL; + } else { + /* + * If the signal is being ignored, + * then we forget about it immediately. + * (Note: we don't set SIGCONT in p_sigignore, + * and if it is set to SIG_IGN, + * action will be SIG_DFL here.) + */ + if (SIGISMEMBER(p->p_sigignore, sig) || (p->p_flag & P_WEXIT)) + return; + if (SIGISMEMBER(p->p_sigmask, sig)) + action = SIG_HOLD; + else if (SIGISMEMBER(p->p_sigcatch, sig)) + action = SIG_CATCH; + else + action = SIG_DFL; + } + + /* + * bring the priority of a process up if we want it to get + * killed in this lifetime. + * XXXKSE think if a better way to do this. + * + * What we need to do is see if there is a thread that will + * be able to accept the signal. e.g. + * FOREACH_THREAD_IN_PROC() { + * if runnable, we're done + * else pick one at random. + * } + */ + /* XXXKSE + * For now there is one thread per proc. + * Effectively select one sucker thread.. 
+ */ + td = FIRST_THREAD_IN_PROC(p); + mtx_lock_spin(&sched_lock); + if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) && + (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0)) + p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + + if (prop & SA_CONT) + SIG_STOPSIGMASK(p->p_siglist); + + if (prop & SA_STOP) { + /* + * If sending a tty stop signal to a member of an orphaned + * process group, discard the signal here if the action + * is default; don't stop the process below if sleeping, + * and don't clear any pending SIGCONT. + */ + if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && + action == SIG_DFL) + return; + SIG_CONTSIGMASK(p->p_siglist); + } + SIGADDSET(p->p_siglist, sig); + mtx_lock_spin(&sched_lock); + signotify(p); + + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) { + mtx_unlock_spin(&sched_lock); + return; + } + + switch (p->p_stat) { + + case SSLEEP: + /* + * If process is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((td->td_flags & TDF_SINTR) == 0) + goto out; + /* + * Process is sleeping and traced... make it runnable + * so it can discover the signal in issignal() and stop + * for the parent. + */ + if (p->p_flag & P_TRACED) + goto run; + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + goto out; + } + /* + * When a sleeping process receives a stop + * signal, process immediately if possible. + * All other (caught or default) signals + * cause the process to run. + */ + if (prop & SA_STOP) { + if (action != SIG_DFL) + goto runfast; + /* + * If a child holding parent blocked, + * stopping could cause deadlock. + */ + if (p->p_flag & P_PPWAIT) + goto out; + mtx_unlock_spin(&sched_lock); + SIGDELSET(p->p_siglist, sig); + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + goto out; + } else + goto runfast; + /* NOTREACHED */ + + case SSTOP: + /* + * If traced process is already stopped, + * then no further action is necessary. + */ + if (p->p_flag & P_TRACED) + goto out; + + /* + * Kill signal always sets processes running. + */ + if (sig == SIGKILL) + goto runfast; + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist, as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, then it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + if (action == SIG_DFL) + SIGDELSET(p->p_siglist, sig); + if (action == SIG_CATCH) + goto runfast; + /* + * XXXKSE + * do this for each thread. 
+ */ + if (p->p_flag & P_KSES) { + mtx_assert(&sched_lock, + MA_OWNED | MA_NOTRECURSED); + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan == NULL) { + setrunnable(td); /* XXXKSE */ + } else { + /* mark it as sleeping */ + } + } + } else { + p->p_flag |= P_CONTINUED; + wakeup((caddr_t)p->p_pptr); + if (td->td_wchan == NULL) + goto run; + p->p_stat = SSLEEP; + } + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again. + * (If we did the shell could get confused.) + */ + SIGDELSET(p->p_siglist, sig); + goto out; + } + + /* + * If process is sleeping interruptibly, then simulate a + * wakeup so that when it is continued, it will be made + * runnable and can look at the signal. But don't make + * the process runnable, leave it stopped. + * XXXKSE should we wake ALL blocked threads? + */ + if (p->p_flag & P_KSES) { + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan && (td->td_flags & TDF_SINTR)){ + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); /* XXXKSE */ + } + } + } else { + if (td->td_wchan && td->td_flags & TDF_SINTR) { + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); /* XXXKSE */ + } + } + goto out; + + default: + /* + * SRUN, SIDL, SZOMB do nothing with the signal, + * other than kicking ourselves if we are running. + * It will either never be noticed, or noticed very soon. + */ + if (p->p_stat == SRUN) { +#ifdef SMP + struct kse *ke; + struct thread *td = curthread; +/* we should only deliver to one thread.. but which one? */ + FOREACH_KSEGRP_IN_PROC(p, kg) { + FOREACH_KSE_IN_GROUP(kg, ke) { + if (ke->ke_thread == td) { + continue; + } + forward_signal(ke->ke_thread); + } + } +#endif + } + goto out; + } + /*NOTREACHED*/ + +runfast: + /* + * Raise priority to at least PUSER. + * XXXKSE Should we make them all run fast? + * Maybe just one would be enough? + */ + + if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) { + FIRST_THREAD_IN_PROC(p)->td_priority = PUSER; + } +run: + /* If we jump here, sched_lock has to be owned. */ + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + setrunnable(td); /* XXXKSE */ +out: + mtx_unlock_spin(&sched_lock); + + /* Once we get here, sched_lock should not be owned. */ + mtx_assert(&sched_lock, MA_NOTOWNED); +} + +/* + * If the current process has received a signal (should be caught or cause + * termination, should interrupt current syscall), return the signal number. + * Stop signals with default action are processed immediately, then cleared; + * they aren't returned. This is checked after each entry to the system for + * a syscall or trap (though this can usually be done without calling issignal + * by checking the pending signal masks in cursig.) The normal call + * sequence is + * + * while (sig = cursig(curproc)) + * postsig(sig); + */ +int +issignal(p) + register struct proc *p; +{ + sigset_t mask; + register int sig, prop; + + PROC_LOCK_ASSERT(p, MA_OWNED); + for (;;) { + int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); + + mask = p->p_siglist; + SIGSETNAND(mask, p->p_sigmask); + if (p->p_flag & P_PPWAIT) + SIG_STOPSIGMASK(mask); + if (SIGISEMPTY(mask)) /* no signal to send */ + return (0); + sig = sig_ffs(&mask); + prop = sigprop(sig); + + _STOPEVENT(p, S_SIG, sig); + + /* + * We should see pending but ignored signals + * only if P_TRACED was on when they were posted. 
+ */ + if (SIGISMEMBER(p->p_sigignore, sig) && (traced == 0)) { + SIGDELSET(p->p_siglist, sig); + continue; + } + if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { + /* + * If traced, always stop. + */ + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + PROC_UNLOCK(p); + DROP_GIANT(); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); + PROC_LOCK(p); + + /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* + * If parent wants us to take the signal, + * then it will leave it in p->p_xstat; + * otherwise we just look for signals again. + */ + SIGDELSET(p->p_siglist, sig); /* clear old signal */ + sig = p->p_xstat; + if (sig == 0) + continue; + + /* + * Put the new signal into p_siglist. If the + * signal is being masked, look for other signals. + */ + SIGADDSET(p->p_siglist, sig); + if (SIGISMEMBER(p->p_sigmask, sig)) + continue; + } + + /* + * Decide whether the signal should be returned. + * Return the signal's number, or fall through + * to clear it from the pending mask. + */ + switch ((int)(intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { + + case (int)SIG_DFL: + /* + * Don't take default actions on system processes. + */ + if (p->p_pid <= 1) { +#ifdef DIAGNOSTIC + /* + * Are you sure you want to ignore SIGSEGV + * in init? XXX + */ + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, sig); +#endif + break; /* == ignore */ + } + /* + * If there is a pending stop signal to process + * with default action, stop here, + * then clear the signal. However, + * if process is member of an orphaned + * process group, ignore tty stop signals. + */ + if (prop & SA_STOP) { + if (p->p_flag & P_TRACED || + (p->p_pgrp->pg_jobc == 0 && + prop & SA_TTYSTOP)) + break; /* == ignore */ + p->p_xstat = sig; + PROC_LOCK(p->p_pptr); + if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + psignal(p->p_pptr, SIGCHLD); + PROC_UNLOCK(p->p_pptr); + mtx_lock_spin(&sched_lock); + stop(p); + PROC_UNLOCK(p); + DROP_GIANT(); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); + PROC_LOCK(p); + break; + } else if (prop & SA_IGNORE) { + /* + * Except for SIGCONT, shouldn't get here. + * Default action is to ignore; drop it. + */ + break; /* == ignore */ + } else + return (sig); + /*NOTREACHED*/ + + case (int)SIG_IGN: + /* + * Masking above should prevent us ever trying + * to take action on an ignored signal other + * than SIGCONT, unless process is traced. + */ + if ((prop & SA_CONT) == 0 && + (p->p_flag & P_TRACED) == 0) + printf("issignal\n"); + break; /* == ignore */ + + default: + /* + * This signal has an action, let + * postsig() process it. + */ + return (sig); + } + SIGDELSET(p->p_siglist, sig); /* take the signal! */ + } + /* NOTREACHED */ +} + +/* + * Put the argument process into the stopped state and notify the parent + * via wakeup. Signals are handled elsewhere. The process must not be + * on the run queue. Must be called with the proc p locked and the scheduler + * lock held. 
+ */ +static void +stop(p) + register struct proc *p; +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&sched_lock, MA_OWNED); + p->p_stat = SSTOP; + p->p_flag &= ~P_WAITED; + wakeup((caddr_t)p->p_pptr); +} + +/* + * Take the action for the specified signal + * from the current set of pending signals. + */ +void +postsig(sig) + register int sig; +{ + struct thread *td = curthread; + register struct proc *p = td->td_proc; + struct sigacts *ps; + sig_t action; + sigset_t returnmask; + int code; + + KASSERT(sig != 0, ("postsig")); + + PROC_LOCK_ASSERT(p, MA_OWNED); + ps = p->p_sigacts; + SIGDELSET(p->p_siglist, sig); + action = ps->ps_sigact[_SIG_IDX(sig)]; +#ifdef KTRACE + if (KTRPOINT(td, KTR_PSIG)) + ktrpsig(sig, action, p->p_flag & P_OLDMASK ? + &p->p_oldsigmask : &p->p_sigmask, 0); +#endif + _STOPEVENT(p, S_SIG, sig); + + if (action == SIG_DFL) { + /* + * Default action, where the default is to kill + * the process. (Other cases were ignored above.) + */ + sigexit(td, sig); + /* NOTREACHED */ + } else { + /* + * If we get here, the signal must be caught. + */ + KASSERT(action != SIG_IGN && !SIGISMEMBER(p->p_sigmask, sig), + ("postsig action")); + /* + * Set the new mask value and also defer further + * occurrences of this signal. + * + * Special case: user has done a sigsuspend. Here the + * current mask is not of interest, but rather the + * mask from before the sigsuspend is what we want + * restored after the signal processing is completed. + */ + if (p->p_flag & P_OLDMASK) { + returnmask = p->p_oldsigmask; + p->p_flag &= ~P_OLDMASK; + } else + returnmask = p->p_sigmask; + + SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); + if (!SIGISMEMBER(ps->ps_signodefer, sig)) + SIGADDSET(p->p_sigmask, sig); + + if (SIGISMEMBER(ps->ps_sigreset, sig)) { + /* + * See do_sigaction() for origin of this code. + */ + SIGDELSET(p->p_sigcatch, sig); + if (sig != SIGCONT && + sigprop(sig) & SA_IGNORE) + SIGADDSET(p->p_sigignore, sig); + ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; + } + p->p_stats->p_ru.ru_nsignals++; + if (p->p_sig != sig) { + code = 0; + } else { + code = p->p_code; + p->p_code = 0; + p->p_sig = 0; + } + (*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code); + } +} + +/* + * Kill the current process for stated reason. + */ +void +killproc(p, why) + struct proc *p; + char *why; +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_ucred ? p->p_ucred->cr_uid : -1, why); + psignal(p, SIGKILL); +} + +/* + * Force the current process to exit with the specified signal, dumping core + * if appropriate. We bypass the normal tests for masked and caught signals, + * allowing unrecoverable failures to terminate the process without changing + * signal state. Mark the accounting record with the signal termination. + * If dumping core, save the signal number for the debugger. Calls exit and + * does not return. + */ +void +sigexit(td, sig) + struct thread *td; + int sig; +{ + struct proc *p = td->td_proc; + + PROC_LOCK_ASSERT(p, MA_OWNED); + p->p_acflag |= AXSIG; + if (sigprop(sig) & SA_CORE) { + p->p_sig = sig; + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) 
+ * XXX : Todo, as well as euid, write out ruid too + */ + PROC_UNLOCK(p); + if (!mtx_owned(&Giant)) + mtx_lock(&Giant); + if (coredump(td) == 0) + sig |= WCOREFLAG; + if (kern_logsigexit) + log(LOG_INFO, + "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + td->td_ucred ? td->td_ucred->cr_uid : -1, + sig &~ WCOREFLAG, + sig & WCOREFLAG ? " (core dumped)" : ""); + } else { + PROC_UNLOCK(p); + if (!mtx_owned(&Giant)) + mtx_lock(&Giant); + } + exit1(td, W_EXITCODE(0, sig)); + /* NOTREACHED */ +} + +static char corefilename[MAXPATHLEN+1] = {"%N.core"}; +SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, + sizeof(corefilename), "process corefile name format string"); + +/* + * expand_name(name, uid, pid) + * Expand the name described in corefilename, using name, uid, and pid. + * corefilename is a printf-like string, with three format specifiers: + * %N name of process ("name") + * %P process id (pid) + * %U user id (uid) + * For example, "%N.core" is the default; they can be disabled completely + * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". + * This is controlled by the sysctl variable kern.corefile (see above). + */ + +static char * +expand_name(name, uid, pid) + const char *name; + uid_t uid; + pid_t pid; +{ + const char *format, *appendstr; + char *temp; + char buf[11]; /* Buffer for pid/uid -- max 4B */ + size_t i, l, n; + + format = corefilename; + temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); + if (temp == NULL) + return (NULL); + for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { + switch (format[i]) { + case '%': /* Format character */ + i++; + switch (format[i]) { + case '%': + appendstr = "%"; + break; + case 'N': /* process name */ + appendstr = name; + break; + case 'P': /* process id */ + sprintf(buf, "%u", pid); + appendstr = buf; + break; + case 'U': /* user id */ + sprintf(buf, "%u", uid); + appendstr = buf; + break; + default: + appendstr = ""; + log(LOG_ERR, + "Unknown format character %c in `%s'\n", + format[i], format); + } + l = strlen(appendstr); + if ((n + l) >= MAXPATHLEN) + goto toolong; + memcpy(temp + n, appendstr, l); + n += l; + break; + default: + temp[n++] = format[i]; + } + } + if (format[i] != '\0') + goto toolong; + return (temp); +toolong: + log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", + (long)pid, name, (u_long)uid); + free(temp, M_TEMP); + return (NULL); +} + +/* + * Dump a process' core. The main routine does some + * policy checking, and creates the name of the coredump; + * then it passes on a vnode and a size limit to the process-specific + * coredump routine if there is one; if there _is not_ one, it returns + * ENOSYS; otherwise it returns the error from the process-specific routine. + * + * XXX: VOP_GETATTR() here requires holding the vnode lock. + */ + +static int +coredump(struct thread *td) +{ + struct proc *p = td->td_proc; + register struct vnode *vp; + register struct ucred *cred = td->td_ucred; + struct flock lf; + struct nameidata nd; + struct vattr vattr; + int error, error1, flags; + struct mount *mp; + char *name; /* name of corefile */ + off_t limit; + + PROC_LOCK(p); + _STOPEVENT(p, S_CORE, 0); + + if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { + PROC_UNLOCK(p); + return (EFAULT); + } + + /* + * Note that the bulk of limit checking is done after + * the corefile is created. The exception is if the limit + * for corefiles is 0, in which case we don't bother + * creating the corefile at all. 
This layout means that + * a corefile is truncated instead of not being created, + * if it is larger than the limit. + */ + limit = p->p_rlimit[RLIMIT_CORE].rlim_cur; + if (limit == 0) { + PROC_UNLOCK(p); + return 0; + } + PROC_UNLOCK(p); + +restart: + name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EINVAL); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */ + flags = O_CREAT | FWRITE | O_NOFOLLOW; + error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + VOP_UNLOCK(vp, 0, td); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_WRLCK; + error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK); + if (error) + goto out2; + + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); + if ((error = vn_close(vp, FWRITE, cred, td)) != 0) + return (error); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { + error = EFAULT; + goto out1; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VOP_LEASE(vp, td, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + PROC_LOCK(p); + p->p_acflag |= ACORE; + PROC_UNLOCK(p); + + error = p->p_sysent->sv_coredump ? + p->p_sysent->sv_coredump(td, vp, limit) : + ENOSYS; + +out1: + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); + vn_finished_write(mp); +out2: + error1 = vn_close(vp, FWRITE, cred, td); + if (error == 0) + error = error1; + return (error); +} + +/* + * Nonexistent system call-- signal process (may want to handle it). + * Flag error in case process won't see signal immediately (blocked or ignored). + */ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nosys(td, args) + struct thread *td; + struct nosys_args *args; +{ + struct proc *p = td->td_proc; + + mtx_lock(&Giant); + PROC_LOCK(p); + psignal(p, SIGSYS); + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (ENOSYS); +} + +/* + * Send a SIGIO or SIGURG signal to a process or process group using + * stored credentials rather than those of the current process. 
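+ * A process typically requests this delivery along the lines of:
+ *	fcntl(fd, F_SETOWN, getpid());
+ *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
+ * after which I/O activity on fd arrives here carrying the credentials
+ * that were saved when F_SETOWN was issued.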
+ */ +void +pgsigio(sigiop, sig, checkctty) + struct sigio **sigiop; + int sig, checkctty; +{ + struct sigio *sigio; + + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + if (sigio->sio_pgid > 0) { + PROC_LOCK(sigio->sio_proc); + if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) + psignal(sigio->sio_proc, sig); + PROC_UNLOCK(sigio->sio_proc); + } else if (sigio->sio_pgid < 0) { + struct proc *p; + + PGRP_LOCK(sigio->sio_pgrp); + LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { + PROC_LOCK(p); + if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && + (checkctty == 0 || (p->p_flag & P_CONTROLT))) + psignal(p, sig); + PROC_UNLOCK(p); + } + PGRP_UNLOCK(sigio->sio_pgrp); + } + SIGIO_UNLOCK(); +} + +static int +filt_sigattach(struct knote *kn) +{ + struct proc *p = curproc; + + kn->kn_ptr.p_proc = p; + kn->kn_flags |= EV_CLEAR; /* automatically set */ + + PROC_LOCK(p); + SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); + PROC_UNLOCK(p); + + return (0); +} + +static void +filt_sigdetach(struct knote *kn) +{ + struct proc *p = kn->kn_ptr.p_proc; + + PROC_LOCK(p); + SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); + PROC_UNLOCK(p); +} + +/* + * signal knotes are shared with proc knotes, so we apply a mask to + * the hint in order to differentiate them from process hints. This + * could be avoided by using a signal-specific knote list, but probably + * isn't worth the trouble. + */ +static int +filt_signal(struct knote *kn, long hint) +{ + + if (hint & NOTE_SIGNAL) { + hint &= ~NOTE_SIGNAL; + + if (kn->kn_id == hint) + kn->kn_data++; + } + return (kn->kn_data != 0); +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c new file mode 100644 index 0000000..5e32eee --- /dev/null +++ b/sys/kern/kern_subr.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_zero.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> + +SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, + "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); + +#ifdef ZERO_COPY_SOCKETS +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/swap_pager.h> +#include <sys/mbuf.h> +#include <machine/cpu.h> + +/* Declared in uipc_socket.c */ +extern int so_zero_copy_receive; + +static int vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr, + vm_offset_t uaddr); +static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio, + struct vm_object *obj, int disposable); + +static int +vm_pgmoveco(mapa, srcobj, kaddr, uaddr) + vm_map_t mapa; + vm_object_t srcobj; + vm_offset_t kaddr, uaddr; +{ + vm_map_t map = mapa; + vm_page_t kern_pg, user_pg; + vm_object_t uobject; + vm_map_entry_t entry; + vm_pindex_t upindex, kpindex; + vm_prot_t prot; + boolean_t wired; + + /* + * First lookup the kernel page. + */ + kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr)); + + if ((vm_map_lookup(&map, uaddr, + VM_PROT_READ, &entry, &uobject, + &upindex, &prot, &wired)) != KERN_SUCCESS) { + return(EFAULT); + } + if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) { + vm_page_sleep_busy(user_pg, 1, "vm_pgmoveco"); + pmap_remove(map->pmap, uaddr, uaddr+PAGE_SIZE); + vm_page_busy(user_pg); + vm_page_free(user_pg); + } + + if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) || + (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) { + printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), " + "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex, + kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 
1 : 0, + kern_pg->hold_count, (u_long)kern_pg->phys_addr); + if ((kern_pg->queue - kern_pg->pc) == PQ_FREE) + panic("vm_pgmoveco: renaming free page"); + else + panic("vm_pgmoveco: renaming busy page"); + } + kpindex = kern_pg->pindex; + vm_page_busy(kern_pg); + vm_page_rename(kern_pg, uobject, upindex); + vm_page_flag_clear(kern_pg, PG_BUSY); + kern_pg->valid = VM_PAGE_BITS_ALL; + + vm_map_lookup_done(map, entry); + return(KERN_SUCCESS); +} +#endif /* ZERO_COPY_SOCKETS */ + +int +uiomove(cp, n, uio) + register caddr_t cp; + register int n; + register struct uio *uio; +{ + struct thread *td = curthread; + register struct iovec *iov; + u_int cnt; + int error = 0; + int save = 0; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomove: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + ("uiomove proc")); + + if (td) { + mtx_lock_spin(&sched_lock); + save = td->td_flags & TDF_DEADLKTREAT; + td->td_flags |= TDF_DEADLKTREAT; + mtx_unlock_spin(&sched_lock); + } + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + if (uio->uio_rw == UIO_READ) + error = copyout(cp, iov->iov_base, cnt); + else + error = copyin(iov->iov_base, cp, cnt); + if (error) + goto out; + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy(cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } +out: + if (td != curthread) printf("uiomove: IT CHANGED!"); + td = curthread; /* Might things have changed in copyin/copyout? */ + if (td) { + mtx_lock_spin(&sched_lock); + td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save; + mtx_unlock_spin(&sched_lock); + } + return (error); +} + +#if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS) +/* + * Experimental support for zero-copy I/O + */ +static int +userspaceco(cp, cnt, uio, obj, disposable) + caddr_t cp; + u_int cnt; + struct uio *uio; + struct vm_object *obj; + int disposable; +{ + struct iovec *iov; + int error; + + iov = uio->uio_iov; + +#ifdef ZERO_COPY_SOCKETS + + if (uio->uio_rw == UIO_READ) { + if ((so_zero_copy_receive != 0) + && (obj != NULL) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0) + && (obj->type == OBJT_DEFAULT) + && (disposable != 0)) { + /* SOCKET: use page-trading */ + /* + * We only want to call vm_pgmoveco() on + * disposeable pages, since it gives the + * kernel page to the userland process. + */ + error = vm_pgmoveco(&curproc->p_vmspace->vm_map, + obj, (vm_offset_t)cp, + (vm_offset_t)iov->iov_base); + + /* + * If we get an error back, attempt + * to use copyout() instead. The + * disposable page should be freed + * automatically if we weren't able to move + * it into userland. 
+ */ + if (error != 0) + error = copyout(cp, iov->iov_base, cnt); +#ifdef ENABLE_VFS_IOOPT + } else if ((vfs_ioopt != 0) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); +#endif /* ENABLE_VFS_IOOPT */ + } else { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } +#else /* ZERO_COPY_SOCKETS */ + if (uio->uio_rw == UIO_READ) { +#ifdef ENABLE_VFS_IOOPT + if ((vfs_ioopt != 0) + && ((cnt & PAGE_MASK) == 0) + && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) + && ((uio->uio_offset & PAGE_MASK) == 0) + && ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); + } else +#endif /* ENABLE_VFS_IOOPT */ + { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } +#endif /* ZERO_COPY_SOCKETS */ + + return (error); +} + +int +uiomoveco(cp, n, uio, obj, disposable) + caddr_t cp; + int n; + struct uio *uio; + struct vm_object *obj; + int disposable; +{ + struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomoveco: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, + ("uiomoveco proc")); + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + + error = userspaceco(cp, cnt, uio, obj, disposable); + + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy(cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} +#endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */ + +#ifdef ENABLE_VFS_IOOPT + +/* + * Experimental support for zero-copy I/O + */ +int +uioread(n, uio, obj, nread) + int n; + struct uio *uio; + struct vm_object *obj; + int *nread; +{ + int npagesmoved; + struct iovec *iov; + u_int cnt, tcnt; + int error; + + *nread = 0; + if (vfs_ioopt < 2) + return 0; + + error = 0; + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + if ((uio->uio_segflg == UIO_USERSPACE) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) ) { + + if (cnt < PAGE_SIZE) + break; + + cnt &= ~PAGE_MASK; + + if (ticks - PCPU_GET(switchticks) >= hogticks) + uio_yield(); + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, &npagesmoved); + + if (npagesmoved == 0) + break; + + tcnt = npagesmoved * PAGE_SIZE; + cnt = tcnt; + + if (error) + break; + + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + *nread += cnt; + n -= cnt; + } else { + break; + } + } + return error; +} +#endif /* ENABLE_VFS_IOOPT */ + +/* + * Give next character to user as result of read. 
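+ * Typically used by tty input code to hand characters to a pending
+ * read(2) one at a time.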
+ */ +int +ureadc(c, uio) + register int c; + register struct uio *uio; +{ + register struct iovec *iov; + +again: + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iovcnt--; + uio->uio_iov++; + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (subyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE: + *iov->iov_base = c; + break; + + case UIO_NOCOPY: + break; + } + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (0); +} + +/* + * General routine to allocate a hash table. + */ +void * +hashinit(elements, type, hashmask) + int elements; + struct malloc_type *type; + u_long *hashmask; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("hashinit: bad elements"); + for (hashsize = 1; hashsize <= elements; hashsize <<= 1) + continue; + hashsize >>= 1; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + return (hashtbl); +} + +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; +#define NPRIMES (sizeof(primes) / sizeof(primes[0])) + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements; + struct malloc_type *type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} + +void +uio_yield() +{ + struct thread *td; + + td = curthread; + mtx_lock_spin(&sched_lock); + DROP_GIANT(); + td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */ + setrunqueue(td); + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + PICKUP_GIANT(); +} + +int +copyinfrom(const void *src, void *dst, size_t len, int seg) +{ + int error = 0; + + switch (seg) { + case UIO_USERSPACE: + error = copyin(src, dst, len); + break; + case UIO_SYSSPACE: + bcopy(src, dst, len); + break; + default: + panic("copyinfrom: bad seg %d\n", seg); + } + return (error); +} + +int +copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg) +{ + int error = 0; + + switch (seg) { + case UIO_USERSPACE: + error = copyinstr(src, dst, len, copied); + break; + case UIO_SYSSPACE: + error = copystr(src, dst, len, copied); + break; + default: + panic("copyinstrfrom: bad seg %d\n", seg); + } + return (error); +} diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c new file mode 100644 index 0000000..2b531c0 --- /dev/null +++ b/sys/kern/kern_switch.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <machine/critical.h> + +CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); + +/* + * Global run queue. + */ +static struct runq runq; +SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) + +/* + * Wrappers which implement old interface; act on global run queue. + */ + +struct thread * +choosethread(void) +{ + return (runq_choose(&runq)->ke_thread); +} + +int +procrunnable(void) +{ + return runq_check(&runq); +} + +void +remrunqueue(struct thread *td) +{ + runq_remove(&runq, td->td_kse); +} + +void +setrunqueue(struct thread *td) +{ + runq_add(&runq, td->td_kse); +} + +/* Critical sections that prevent preemption. */ +void +critical_enter(void) +{ + struct thread *td; + + td = curthread; + if (td->td_critnest == 0) + cpu_critical_enter(); + td->td_critnest++; +} + +void +critical_exit(void) +{ + struct thread *td; + + td = curthread; + if (td->td_critnest == 1) { + td->td_critnest = 0; + cpu_critical_exit(); + } else { + td->td_critnest--; + } +} + +/* + * Clear the status bit of the queue corresponding to priority level pri, + * indicating that it is empty. + */ +static __inline void +runq_clrbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); +} + +/* + * Find the index of the first non-empty run queue. This is done by + * scanning the status bits, a set bit indicates a non-empty queue. + */ +static __inline int +runq_findbit(struct runq *rq) +{ + struct rqbits *rqb; + int pri; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); + CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", + rqb->rqb_bits[i], i, pri); + return (pri); + } + + return (-1); +} + +/* + * Set the status bit of the queue corresponding to priority level pri, + * indicating that it is non-empty. 
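+ * For example, with 32-bit status words, queue index 37 lives in
+ * word 37 >> 5 == 1 as bit 37 & 31 == 5; runq_findbit() reverses the
+ * mapping with RQB_FFS() when hunting for the lowest non-empty queue.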
+ */ +static __inline void +runq_setbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); +} + +/* + * Add the process to the queue specified by its priority, and set the + * corresponding status bit. + */ +void +runq_add(struct runq *rq, struct kse *ke) +{ + struct rqhead *rqh; + int pri; + +#ifdef INVARIANTS + struct proc *p = ke->ke_proc; +#endif + if (ke->ke_flags & KEF_ONRUNQ) + return; + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", + p, p->p_comm)); + pri = ke->ke_thread->td_priority / RQ_PPQ; + ke->ke_rqindex = pri; + runq_setbit(rq, pri); + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", + ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); + TAILQ_INSERT_TAIL(rqh, ke, ke_procq); + ke->ke_flags |= KEF_ONRUNQ; +} + +/* + * Return true if there are runnable processes of any priority on the run + * queue, false otherwise. Has no side effects, does not modify the run + * queue structure. + */ +int +runq_check(struct runq *rq) +{ + struct rqbits *rqb; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", + rqb->rqb_bits[i], i); + return (1); + } + CTR0(KTR_RUNQ, "runq_check: empty"); + + return (0); +} + +/* + * Find and remove the highest priority process from the run queue. + * If there are no runnable processes, the per-cpu idle process is + * returned. Will not return NULL under any circumstances. + */ +struct kse * +runq_choose(struct runq *rq) +{ + struct rqhead *rqh; + struct kse *ke; + int pri; + + mtx_assert(&sched_lock, MA_OWNED); + if ((pri = runq_findbit(rq)) != -1) { + rqh = &rq->rq_queues[pri]; + ke = TAILQ_FIRST(rqh); + KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); + KASSERT(ke->ke_proc->p_stat == SRUN, + ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid, + ke->ke_proc->p_comm, ke->ke_proc->p_stat)); + CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); + TAILQ_REMOVE(rqh, ke, ke_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_choose: empty"); + runq_clrbit(rq, pri); + } + ke->ke_flags &= ~KEF_ONRUNQ; + return (ke); + } + CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); + + return (PCPU_GET(idlethread)->td_kse); +} + +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + bzero(rq, sizeof *rq); + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + +/* + * Remove the process from the queue specified by its priority, and clear the + * corresponding status bit if the queue becomes empty. 
+ */ +void +runq_remove(struct runq *rq, struct kse *ke) +{ + struct rqhead *rqh; + int pri; + + if (!(ke->ke_flags & KEF_ONRUNQ)) + return; + mtx_assert(&sched_lock, MA_OWNED); + pri = ke->ke_rqindex; + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", + ke, ke->ke_thread->td_priority, pri, rqh); + KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); + TAILQ_REMOVE(rqh, ke, ke_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_remove: empty"); + runq_clrbit(rq, pri); + } + ke->ke_flags &= ~KEF_ONRUNQ; +} diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c new file mode 100644 index 0000000..2f69a00 --- /dev/null +++ b/sys/kern/kern_sx.c @@ -0,0 +1,348 @@ +/* + * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Shared/exclusive locks. This implementation assures deterministic lock + * granting behavior, so that slocks and xlocks are interleaved. + * + * Priority propagation will not generally raise the priority of lock holders, + * so should not be relied upon in combination with sx locks. 
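/*
 * Sketch (not part of the committed code): typical kernel-context use of the
 * sx primitives implemented in this file.  Readers take the lock shared, the
 * single writer takes it exclusive.  The names example_lock/example_data are
 * invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>

static struct sx example_lock;
static int example_data;

static void
example_setup(void)
{
        sx_init(&example_lock, "example");
}

static int
example_read(void)
{
        int v;

        sx_slock(&example_lock);        /* many readers may hold this */
        v = example_data;
        sx_sunlock(&example_lock);
        return (v);
}

static void
example_write(int v)
{
        sx_xlock(&example_lock);        /* exactly one writer at a time */
        example_data = v;
        sx_xunlock(&example_lock);
}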
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ktr.h> +#include <sys/condvar.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> + +struct lock_class lock_class_sx = { + "sx", + LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE +}; + +#ifndef INVARIANTS +#define _sx_assert(sx, what, file, line) +#endif + +void +sx_sysinit(void *arg) +{ + struct sx_args *sargs = arg; + + sx_init(sargs->sa_sx, sargs->sa_desc); +} + +void +sx_init(struct sx *sx, const char *description) +{ + struct lock_object *lock; + + lock = &sx->sx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("sx lock %s %p already initialized", description, sx)); + bzero(sx, sizeof(*sx)); + lock->lo_class = &lock_class_sx; + lock->lo_type = lock->lo_name = description; + lock->lo_flags = LO_WITNESS | LO_RECURSABLE | LO_SLEEPABLE | + LO_UPGRADABLE; + sx->sx_lock = mtx_pool_find(sx); + sx->sx_cnt = 0; + cv_init(&sx->sx_shrd_cv, description); + sx->sx_shrd_wcnt = 0; + cv_init(&sx->sx_excl_cv, description); + sx->sx_excl_wcnt = 0; + sx->sx_xholder = NULL; + + LOCK_LOG_INIT(lock, 0); + + WITNESS_INIT(lock); +} + +void +sx_destroy(struct sx *sx) +{ + + LOCK_LOG_DESTROY(&sx->sx_object, 0); + + KASSERT((sx->sx_cnt == 0 && sx->sx_shrd_wcnt == 0 && sx->sx_excl_wcnt == + 0), ("%s (%s): holders or waiters\n", __func__, + sx->sx_object.lo_name)); + + sx->sx_lock = NULL; + cv_destroy(&sx->sx_shrd_cv); + cv_destroy(&sx->sx_excl_cv); + + WITNESS_DESTROY(&sx->sx_object); +} + +void +_sx_slock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + KASSERT(sx->sx_xholder != curthread, + ("%s (%s): slock while xlock is held @ %s:%d\n", __func__, + sx->sx_object.lo_name, file, line)); + + /* + * Loop in case we lose the race for lock acquisition. + */ + while (sx->sx_cnt < 0) { + sx->sx_shrd_wcnt++; + cv_wait(&sx->sx_shrd_cv, sx->sx_lock); + sx->sx_shrd_wcnt--; + } + + /* Acquire a shared lock. */ + sx->sx_cnt++; + + LOCK_LOG_LOCK("SLOCK", &sx->sx_object, 0, 0, file, line); + WITNESS_LOCK(&sx->sx_object, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_slock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + if (sx->sx_cnt >= 0) { + sx->sx_cnt++; + LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 1, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_TRYLOCK, file, line); + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_xlock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + + /* + * With sx locks, we're absolutely not permitted to recurse on + * xlocks, as it is fatal (deadlock). Normally, recursion is handled + * by WITNESS, but as it is not semantically correct to hold the + * xlock while in here, we consider it API abuse and put it under + * INVARIANTS. + */ + KASSERT(sx->sx_xholder != curthread, + ("%s (%s): xlock already held @ %s:%d", __func__, + sx->sx_object.lo_name, file, line)); + + /* Loop in case we lose the race for lock acquisition. */ + while (sx->sx_cnt != 0) { + sx->sx_excl_wcnt++; + cv_wait(&sx->sx_excl_cv, sx->sx_lock); + sx->sx_excl_wcnt--; + } + + MPASS(sx->sx_cnt == 0); + + /* Acquire an exclusive lock. 
*/ + sx->sx_cnt--; + sx->sx_xholder = curthread; + + LOCK_LOG_LOCK("XLOCK", &sx->sx_object, 0, 0, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_xlock(struct sx *sx, const char *file, int line) +{ + + mtx_lock(sx->sx_lock); + if (sx->sx_cnt == 0) { + sx->sx_cnt--; + sx->sx_xholder = curthread; + LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 1, file, line); + WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, + line); + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_sunlock(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_SLOCKED, file, line); + mtx_lock(sx->sx_lock); + + WITNESS_UNLOCK(&sx->sx_object, 0, file, line); + + /* Release. */ + sx->sx_cnt--; + + /* + * If we just released the last shared lock, wake any waiters up, giving + * exclusive lockers precedence. In order to make sure that exclusive + * lockers won't be blocked forever, don't wake shared lock waiters if + * there are exclusive lock waiters. + */ + if (sx->sx_excl_wcnt > 0) { + if (sx->sx_cnt == 0) + cv_signal(&sx->sx_excl_cv); + } else if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + + LOCK_LOG_LOCK("SUNLOCK", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +void +_sx_xunlock(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_XLOCKED, file, line); + mtx_lock(sx->sx_lock); + MPASS(sx->sx_cnt == -1); + + WITNESS_UNLOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line); + + /* Release. */ + sx->sx_cnt++; + sx->sx_xholder = NULL; + + /* + * Wake up waiters if there are any. Give precedence to slock waiters. + */ + if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + else if (sx->sx_excl_wcnt > 0) + cv_signal(&sx->sx_excl_cv); + + LOCK_LOG_LOCK("XUNLOCK", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +int +_sx_try_upgrade(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_SLOCKED, file, line); + mtx_lock(sx->sx_lock); + + if (sx->sx_cnt == 1) { + sx->sx_cnt = -1; + sx->sx_xholder = curthread; + + LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 1, file, line); + WITNESS_UPGRADE(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + + mtx_unlock(sx->sx_lock); + return (1); + } else { + LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 0, file, line); + mtx_unlock(sx->sx_lock); + return (0); + } +} + +void +_sx_downgrade(struct sx *sx, const char *file, int line) +{ + + _sx_assert(sx, SX_XLOCKED, file, line); + mtx_lock(sx->sx_lock); + MPASS(sx->sx_cnt == -1); + + WITNESS_DOWNGRADE(&sx->sx_object, 0, file, line); + + sx->sx_cnt = 1; + sx->sx_xholder = NULL; + if (sx->sx_shrd_wcnt > 0) + cv_broadcast(&sx->sx_shrd_cv); + + LOCK_LOG_LOCK("XDOWNGRADE", &sx->sx_object, 0, 0, file, line); + + mtx_unlock(sx->sx_lock); +} + +#ifdef INVARIANT_SUPPORT +#ifndef INVARIANTS +#undef _sx_assert +#endif + +/* + * In the non-WITNESS case, sx_assert() can only detect that at least + * *some* thread owns an slock, but it cannot guarantee that *this* + * thread owns an slock. 
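/*
 * Illustration (not part of the committed code): the sx_cnt encoding that
 * the functions above and _sx_assert() below depend on -- a positive count
 * is the number of shared holders, -1 means one exclusive holder, 0 means
 * the lock is free.  This stand-alone model only walks the transitions; it
 * has none of the real blocking or mutex protection.
 */
#include <stdio.h>

static int cnt;         /* models sx->sx_cnt */

int
main(void)
{
        cnt++;                          /* slock: 1 shared holder  */
        cnt++;                          /* slock: 2 shared holders */
        printf("shared holders: %d, upgrade possible: %s\n",
            cnt, cnt == 1 ? "yes" : "no");      /* no: another reader */
        cnt--;                          /* sunlock */
        printf("shared holders: %d, upgrade possible: %s\n",
            cnt, cnt == 1 ? "yes" : "no");      /* yes: sole reader */
        cnt = -1;                       /* sx_try_upgrade() succeeded */
        printf("exclusive held: %s\n", cnt == -1 ? "yes" : "no");
        cnt = 1;                        /* sx_downgrade() back to shared */
        cnt--;                          /* sunlock: lock is free again */
        printf("free: %s\n", cnt == 0 ? "yes" : "no");
        return (0);
}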
+ */ +void +_sx_assert(struct sx *sx, int what, const char *file, int line) +{ + + switch (what) { + case SX_LOCKED: + case SX_SLOCKED: +#ifdef WITNESS + witness_assert(&sx->sx_object, what, file, line); +#else + mtx_lock(sx->sx_lock); + if (sx->sx_cnt <= 0 && + (what == SX_SLOCKED || sx->sx_xholder != curthread)) + printf("Lock %s not %slocked @ %s:%d\n", + sx->sx_object.lo_name, (what == SX_SLOCKED) ? + "share " : "", file, line); + mtx_unlock(sx->sx_lock); +#endif + break; + case SX_XLOCKED: + mtx_lock(sx->sx_lock); + if (sx->sx_xholder != curthread) + printf("Lock %s not exclusively locked @ %s:%d\n", + sx->sx_object.lo_name, file, line); + mtx_unlock(sx->sx_lock); + break; + default: + panic("Unknown sx lock assertion: %d @ %s:%d", what, file, + line); + } +} +#endif /* INVARIANT_SUPPORT */ diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c new file mode 100644 index 0000000..6f9adad --- /dev/null +++ b/sys/kern/kern_synch.c @@ -0,0 +1,970 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $FreeBSD$ + */ + +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/condvar.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/smp.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <machine/cpu.h> + +static void sched_setup(void *dummy); +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) + +int hogticks; +int lbolt; +int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ + +static struct callout loadav_callout; +static struct callout schedcpu_callout; +static struct callout roundrobin_callout; + +struct loadavg averunnable = + { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ +/* + * Constants for averages over 1, 5, and 15 minutes + * when sampling at 5 second intervals. + */ +static fixpt_t cexp[3] = { + 0.9200444146293232 * FSCALE, /* exp(-1/12) */ + 0.9834714538216174 * FSCALE, /* exp(-1/60) */ + 0.9944598480048967 * FSCALE, /* exp(-1/180) */ +}; + +static void endtsleep(void *); +static void loadav(void *arg); +static void roundrobin(void *arg); +static void schedcpu(void *arg); + +static int +sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) +{ + int error, new_val; + + new_val = sched_quantum * tick; + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (new_val < tick) + return (EINVAL); + sched_quantum = new_val / tick; + hogticks = 2 * sched_quantum; + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof sched_quantum, sysctl_kern_quantum, "I", + "Roundrobin scheduling quantum in microseconds"); + +/* + * Arrange to reschedule if necessary, taking the priorities and + * schedulers into account. + */ +void +maybe_resched(struct thread *td) +{ + + mtx_assert(&sched_lock, MA_OWNED); + if (td->td_priority < curthread->td_priority) + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; +} + +int +roundrobin_interval(void) +{ + return (sched_quantum); +} + +/* + * Force switch among equal priority processes every 100ms. + * We don't actually need to force a context switch of the current process. + * The act of firing the event triggers a context switch to softclock() and + * then switching back out again which is equivalent to a preemption, thus + * no further work is needed on the local CPU. + */ +/* ARGSUSED */ +static void +roundrobin(arg) + void *arg; +{ + +#ifdef SMP + mtx_lock_spin(&sched_lock); + forward_roundrobin(); + mtx_unlock_spin(&sched_lock); +#endif + + callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); +} + +/* + * Constants for digital decay and forget: + * 90% of (p_estcpu) usage in 5 * loadav time + * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) + * Note that, as ps(1) mentions, this can let percentages + * total over 100% (I've seen 137.9% for 3 processes). + * + * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. + * + * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
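/*
 * Stand-alone check (not part of the committed code): the derivation
 * continues below; this userland sketch just reproduces the "actual power
 * values" table it arrives at.  For decay = b/(b+1) with b = 2*loadav, it
 * prints how many applications of the decay are needed to forget 90% of
 * p_estcpu, which should be close to 5*loadav.  Compile with -lm.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
        int loadav;
        double b, power;

        for (loadav = 1; loadav <= 4; loadav++) {
                b = 2.0 * loadav;
                power = log(0.1) / log(b / (b + 1.0));
                printf("loadav %d: power %.2f (5*loadav = %d)\n",
                    loadav, power, 5 * loadav);
        }
        /* Output matches the table below: 5.68, 10.32, 14.94, 19.55. */
        return (0);
}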
+ * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED + * + * Actual power values for the implemented algorithm are as follows: + * loadav: 1 2 3 4 + * power: 5.68 10.32 14.94 19.55 + */ + +/* calculations for digital decay to forget 90% of usage in 5*loadav sec */ +#define loadfactor(loadav) (2 * (loadav)) +#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) + +/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + +/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ +static int fscale __unused = FSCALE; +SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); + +/* + * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the + * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below + * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). + * + * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: + * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). + * + * If you don't want to bother with the faster/more-accurate formula, you + * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate + * (more general) method of calculating the %age of CPU used by a process. + */ +#define CCPU_SHIFT 11 + +/* + * Recompute process priorities, every hz ticks. + * MP-safe, called without the Giant mutex. + */ +/* ARGSUSED */ +static void +schedcpu(arg) + void *arg; +{ + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + struct thread *td; + struct proc *p; + struct kse *ke; + struct ksegrp *kg; + int realstathz; + int awake; + + realstathz = stathz ? 
stathz : hz; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + mtx_lock_spin(&sched_lock); + p->p_swtime++; + FOREACH_KSEGRP_IN_PROC(p, kg) { + awake = 0; + FOREACH_KSE_IN_GROUP(kg, ke) { + /* + * Increment time in/out of memory and sleep + * time (if sleeping). We ignore overflow; + * with 16-bit int's (remember them?) + * overflow takes 45 days. + */ + /* XXXKSE */ + /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */ + if (p->p_stat == SSLEEP || p->p_stat == SSTOP) { + ke->ke_slptime++; + } else { + ke->ke_slptime = 0; + awake = 1; + } + + /* + * pctcpu is only for ps? + * Do it per kse.. and add them up at the end? + * XXXKSE + */ + ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; + /* + * If the kse has been idle the entire second, + * stop recalculating its priority until + * it wakes up. + */ + if (ke->ke_slptime > 1) { + continue; + } + +#if (FSHIFT >= CCPU_SHIFT) + ke->ke_pctcpu += (realstathz == 100) ? + ((fixpt_t) ke->ke_cpticks) << + (FSHIFT - CCPU_SHIFT) : + 100 * (((fixpt_t) ke->ke_cpticks) << + (FSHIFT - CCPU_SHIFT)) / realstathz; +#else + ke->ke_pctcpu += ((FSCALE - ccpu) * + (ke->ke_cpticks * FSCALE / realstathz)) >> + FSHIFT; +#endif + ke->ke_cpticks = 0; + } /* end of kse loop */ + if (awake == 0) { + kg->kg_slptime++; + } else { + kg->kg_slptime = 0; + } + kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); + resetpriority(kg); + td = FIRST_THREAD_IN_PROC(p); + if (td->td_priority >= PUSER && + (p->p_sflag & PS_INMEM)) { + int changedqueue = + ((td->td_priority / RQ_PPQ) != + (kg->kg_user_pri / RQ_PPQ)); + + td->td_priority = kg->kg_user_pri; + FOREACH_KSE_IN_GROUP(kg, ke) { + if ((ke->ke_oncpu == NOCPU) && + (p->p_stat == SRUN) && /* XXXKSE */ + changedqueue) { + remrunqueue(ke->ke_thread); + setrunqueue(ke->ke_thread); + } + } + } + } /* end of ksegrp loop */ + mtx_unlock_spin(&sched_lock); + } /* end of process loop */ + sx_sunlock(&allproc_lock); + wakeup((caddr_t)&lbolt); + callout_reset(&schedcpu_callout, hz, schedcpu, NULL); +} + +/* + * Recalculate the priority of a process after it has slept for a while. + * For all load averages >= 1 and max p_estcpu of 255, sleeping for at + * least six times the loadfactor will decay p_estcpu to zero. + */ +void +updatepri(td) + register struct thread *td; +{ + register struct ksegrp *kg; + register unsigned int newcpu; + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + + if (td == NULL) + return; + kg = td->td_ksegrp; + newcpu = kg->kg_estcpu; + if (kg->kg_slptime > 5 * loadfac) + kg->kg_estcpu = 0; + else { + kg->kg_slptime--; /* the first time was done in schedcpu */ + while (newcpu && --kg->kg_slptime) + newcpu = decay_cpu(loadfac, newcpu); + kg->kg_estcpu = newcpu; + } + resetpriority(td->td_ksegrp); +} + +/* + * We're only looking at 7 bits of the address; everything is + * aligned to 4, lots of things are aligned to greater powers + * of 2. Shift right by 8, i.e. drop the bottom 256 worth. + */ +#define TABLESIZE 128 +static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; +#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) + +void +sleepinit(void) +{ + int i; + + sched_quantum = hz/10; + hogticks = 2 * sched_quantum; + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + +/* + * General sleep call. Suspends the current process until a wakeup is + * performed on the specified identifier. The process will then be made + * runnable with the specified priority. Sleeps at most timo/hz seconds + * (0 means no timeout). 
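/*
 * Illustration (not part of the committed code): the slpque hash defined
 * above.  LOOKUP() drops the low 8 address bits of the wait channel (the
 * channel is normally the address of a kernel object, so the low bits carry
 * little information) and keeps the next 7 bits as the bucket index, since
 * TABLESIZE is 128.
 */
#include <stdint.h>
#include <stdio.h>

#define SK_TABLESIZE 128
#define SK_LOOKUP(x) (((intptr_t)(x) >> 8) & (SK_TABLESIZE - 1))

int
main(void)
{
        static int objects[4];
        int i;

        /* Nearby wait channels tend to collapse into the same bucket. */
        for (i = 0; i < 4; i++)
                printf("wchan %p -> bucket %ld\n",
                    (void *)&objects[i], (long)SK_LOOKUP(&objects[i]));
        return (0);
}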
If pri includes PCATCH flag, signals are checked + * before and after sleeping, else signals are not checked. Returns 0 if + * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a + * signal needs to be delivered, ERESTART is returned if the current system + * call should be restarted if possible, and EINTR is returned if the system + * call should be interrupted by the signal (return EINTR). + * + * The mutex argument is exited before the caller is suspended, and + * entered before msleep returns. If priority includes the PDROP + * flag the mutex is not entered before returning. + */ +int +msleep(ident, mtx, priority, wmesg, timo) + void *ident; + struct mtx *mtx; + int priority, timo; + const char *wmesg; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + int sig, catch = priority & PCATCH; + int rval = 0; + WITNESS_SAVE_DECL(mtx); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + WITNESS_SLEEP(0, &mtx->mtx_object); + KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, + ("sleeping without a mutex")); + mtx_lock_spin(&sched_lock); + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, + * just give interrupts a chance, then just return; + * don't run any other procs or panic below, + * in case this is the idle process and already asleep. + */ + if (mtx != NULL && priority & PDROP) + mtx_unlock(mtx); + mtx_unlock_spin(&sched_lock); + return (0); + } + + DROP_GIANT(); + + if (mtx != NULL) { + mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); + WITNESS_SAVE(&mtx->mtx_object, mtx); + mtx_unlock(mtx); + if (priority & PDROP) + mtx = NULL; + } + + KASSERT(p != NULL, ("msleep1")); + KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep")); + + td->td_wchan = ident; + td->td_wmesg = wmesg; + td->td_kse->ke_slptime = 0; /* XXXKSE */ + td->td_ksegrp->kg_slptime = 0; + td->td_priority = priority & PRIMASK; + CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", + td, p->p_pid, p->p_comm, wmesg, ident); + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); + if (timo) + callout_reset(&td->td_slpcallout, timo, endtsleep, td); + /* + * We put ourselves on the sleep queue and start our timeout + * before calling cursig, as we could stop there, and a wakeup + * or a SIGCONT (or both) could occur while we were stopped. + * A SIGCONT would cause us to be marked as SSLEEP + * without resuming us, thus we must be ready for sleep + * when cursig is called. If the wakeup happens while we're + * stopped, td->td_wchan will be 0 upon return from cursig. + */ + if (catch) { + CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p, + p->p_pid, p->p_comm); + td->td_flags |= TDF_SINTR; + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + sig = cursig(p); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (sig != 0) { + if (td->td_wchan != NULL) + unsleep(td); + } else if (td->td_wchan == NULL) + catch = 0; + } else + sig = 0; + if (td->td_wchan != NULL) { + td->td_proc->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + } + CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid, + p->p_comm); + KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN")); + td->td_flags &= ~TDF_SINTR; + if (td->td_flags & TDF_TIMEOUT) { + td->td_flags &= ~TDF_TIMEOUT; + if (sig == 0) + rval = EWOULDBLOCK; + } else if (td->td_flags & TDF_TIMOFAIL) + td->td_flags &= ~TDF_TIMOFAIL; + else if (timo && callout_stop(&td->td_slpcallout) == 0) { + /* + * This isn't supposed to be pretty. 
If we are here, then + * the endtsleep() callout is currently executing on another + * CPU and is either spinning on the sched_lock or will be + * soon. If we don't synchronize here, there is a chance + * that this process may msleep() again before the callout + * has a chance to run and the callout may end up waking up + * the wrong msleep(). Yuck. + */ + td->td_flags |= TDF_TIMEOUT; + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + } + mtx_unlock_spin(&sched_lock); + + if (rval == 0 && catch) { + PROC_LOCK(p); + /* XXX: shouldn't we always be calling cursig() */ + if (sig != 0 || (sig = cursig(p))) { + if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) + rval = EINTR; + else + rval = ERESTART; + } + PROC_UNLOCK(p); + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (mtx != NULL) { + mtx_lock(mtx); + WITNESS_RESTORE(&mtx->mtx_object, mtx); + } + return (rval); +} + +/* + * Implement timeout for msleep() + * + * If process hasn't been awakened (wchan non-zero), + * set timeout flag and undo the sleep. If proc + * is stopped, just unsleep so it will remain stopped. + * MP-safe, called without the Giant mutex. + */ +static void +endtsleep(arg) + void *arg; +{ + register struct thread *td = arg; + + CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + /* + * This is the other half of the synchronization with msleep() + * described above. If the PS_TIMEOUT flag is set, we lost the + * race and just need to put the process back on the runqueue. + */ + if ((td->td_flags & TDF_TIMEOUT) != 0) { + td->td_flags &= ~TDF_TIMEOUT; + setrunqueue(td); + } else if (td->td_wchan != NULL) { + if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + setrunnable(td); + else + unsleep(td); + td->td_flags |= TDF_TIMEOUT; + } else { + td->td_flags |= TDF_TIMOFAIL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Remove a process from its wait queue + */ +void +unsleep(struct thread *td) +{ + + mtx_lock_spin(&sched_lock); + if (td->td_wchan != NULL) { + TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); + td->td_wchan = NULL; + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Make all processes sleeping on the specified identifier runnable. + */ +void +wakeup(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct thread *td; + struct thread *ntd; + struct proc *p; + + mtx_lock_spin(&sched_lock); + qp = &slpque[LOOKUP(ident)]; +restart: + for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { + ntd = TAILQ_NEXT(td, td_slpq); + p = td->td_proc; + if (td->td_wchan == ident) { + TAILQ_REMOVE(qp, td, td_slpq); + td->td_wchan = NULL; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)", + td, p->p_pid, p->p_comm); + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + td->td_proc->p_stat = SRUN; + if (p->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + } else { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target process is currently + * swapped out. 
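/*
 * Sketch (not part of the committed code): the usual way msleep() and
 * wakeup() above are paired in kernel code.  The condition is re-tested in
 * a loop because a wakeup on the channel only means "look again"; msleep()
 * drops the mutex while asleep and retakes it before returning.  The names
 * example_mtx/example_ready are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

static struct mtx example_mtx;
static int example_ready;

static void
example_wait(void)
{
        mtx_lock(&example_mtx);
        while (example_ready == 0)
                msleep(&example_ready, &example_mtx, PZERO, "exwait", 0);
        example_ready = 0;
        mtx_unlock(&example_mtx);
}

static void
example_post(void)
{
        mtx_lock(&example_mtx);
        example_ready = 1;
        wakeup(&example_ready);
        mtx_unlock(&example_mtx);
}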
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct thread *td; + register struct proc *p; + struct thread *ntd; + + mtx_lock_spin(&sched_lock); + qp = &slpque[LOOKUP(ident)]; +restart: + for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { + ntd = TAILQ_NEXT(td, td_slpq); + p = td->td_proc; + if (td->td_wchan == ident) { + TAILQ_REMOVE(qp, td, td_slpq); + td->td_wchan = NULL; + if (td->td_proc->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + td->td_proc->p_stat = SRUN; + if (p->p_sflag & PS_INMEM) { + setrunqueue(td); + maybe_resched(td); + break; + } else { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* + * The machine independent parts of mi_switch(). + */ +void +mi_switch() +{ + struct bintime new_switchtime; + struct thread *td = curthread; /* XXX */ + register struct proc *p = td->td_proc; /* XXX */ +#if 0 + register struct rlimit *rlim; +#endif + u_int sched_nest; + + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); +#ifdef INVARIANTS + if (p->p_stat != SMTX && p->p_stat != SRUN) + mtx_assert(&Giant, MA_NOTOWNED); +#endif + + /* + * Compute the amount of time during which the current + * process was running, and add that to its total so far. + */ + binuptime(&new_switchtime); + bintime_add(&p->p_runtime, &new_switchtime); + bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); + +#ifdef DDB + /* + * Don't perform context switches from the debugger. + */ + if (db_active) { + mtx_unlock_spin(&sched_lock); + db_error("Context switches not allowed in the debugger."); + } +#endif + +#if 0 + /* + * Check if the process exceeds its cpu resource allocation. + * If over max, kill it. + * + * XXX drop sched_lock, pickup Giant + */ + if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + p->p_runtime > p->p_limit->p_cpulimit) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + killproc(p, "exceeded maximum CPU limit"); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + } else { + mtx_unlock_spin(&sched_lock); + PROC_LOCK(p); + psignal(p, SIGXCPU); + mtx_lock_spin(&sched_lock); + PROC_UNLOCK(p); + if (rlim->rlim_cur < rlim->rlim_max) { + /* XXX: we should make a private copy */ + rlim->rlim_cur += 5; + } + } + } +#endif + + /* + * Pick a new current process and record its start time. + */ + cnt.v_swtch++; + PCPU_SET(switchtime, new_switchtime); + CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + sched_nest = sched_lock.mtx_recurse; + td->td_lastcpu = td->td_kse->ke_oncpu; + td->td_kse->ke_oncpu = NOCPU; + td->td_kse->ke_flags &= ~KEF_NEEDRESCHED; + cpu_switch(); + td->td_kse->ke_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; + CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid, + p->p_comm); + if (PCPU_GET(switchtime.sec) == 0) + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); +} + +/* + * Change process state to be runnable, + * placing it on the run queue if it is in memory, + * and awakening the swapper if it isn't in memory. 
+ */ +void +setrunnable(struct thread *td) +{ + struct proc *p = td->td_proc; + + mtx_lock_spin(&sched_lock); + switch (p->p_stat) { + case SZOMB: /* not a thread flag XXXKSE */ + panic("setrunnable(1)"); + } + switch (td->td_proc->p_stat) { + case 0: + case SRUN: + case SWAIT: + default: + panic("setrunnable(2)"); + case SSTOP: + case SSLEEP: /* e.g. when sending signals */ + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); + break; + + case SIDL: + break; + } + td->td_proc->p_stat = SRUN; + if (td->td_ksegrp->kg_slptime > 1) + updatepri(td); + td->td_ksegrp->kg_slptime = 0; + td->td_kse->ke_slptime = 0; + if ((p->p_sflag & PS_INMEM) == 0) { + p->p_sflag |= PS_SWAPINREQ; + wakeup((caddr_t)&proc0); + } else { + setrunqueue(td); + maybe_resched(td); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Compute the priority of a process when running in user mode. + * Arrange to reschedule if the resulting priority is better + * than that of the current process. + */ +void +resetpriority(kg) + register struct ksegrp *kg; +{ + register unsigned int newpriority; + struct thread *td; + + mtx_lock_spin(&sched_lock); + if (kg->kg_pri_class == PRI_TIMESHARE) { + newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + kg->kg_user_pri = newpriority; + } + FOREACH_THREAD_IN_GROUP(kg, td) { + maybe_resched(td); + } + mtx_unlock_spin(&sched_lock); +} + +/* + * Compute a tenex style load average of a quantity on + * 1, 5 and 15 minute intervals. + * XXXKSE Needs complete rewrite when correct info is available. + * Completely Bogus.. only works with 1:1 (but compiles ok now :-) + */ +static void +loadav(void *arg) +{ + int i, nrun; + struct loadavg *avg; + struct proc *p; + struct ksegrp *kg; + + avg = &averunnable; + sx_slock(&allproc_lock); + nrun = 0; + FOREACH_PROC_IN_SYSTEM(p) { + FOREACH_KSEGRP_IN_PROC(p, kg) { + switch (p->p_stat) { + case SRUN: + if ((p->p_flag & P_NOLOAD) != 0) + goto nextproc; + /* FALLTHROUGH */ + case SIDL: + nrun++; + } +nextproc: + continue; + } + } + sx_sunlock(&allproc_lock); + for (i = 0; i < 3; i++) + avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; + + /* + * Schedule the next update to occur after 5 seconds, but add a + * random variation to avoid synchronisation with processes that + * run at regular intervals. + */ + callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), + loadav, NULL); +} + +/* ARGSUSED */ +static void +sched_setup(dummy) + void *dummy; +{ + + callout_init(&schedcpu_callout, 1); + callout_init(&roundrobin_callout, 0); + callout_init(&loadav_callout, 0); + + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); + loadav(NULL); +} + +/* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. resetpriority() will + * compute a different priority each time p_estcpu increases by + * INVERSE_ESTCPU_WEIGHT + * (until MAXPRI is reached). The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principle is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. 
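/*
 * Illustration (not part of the committed code): the fixed-point
 * load-average update used by loadav() above, run in userland.  FSHIFT is
 * taken as 11 (FSCALE 2048) for the sketch; the real values come from
 * <sys/param.h>.  With nrun held at 2, the 1-minute average climbs toward
 * 2.0 over successive ~5 second samples.
 */
#include <stdio.h>

#define SK_FSHIFT 11
#define SK_FSCALE (1 << SK_FSHIFT)

int
main(void)
{
        /* exp(-1/12) * FSCALE: the 1-minute constant, as in cexp[0]. */
        unsigned long cexp0 = 0.9200444146293232 * SK_FSCALE;
        unsigned long ldavg = 0;        /* fixed-point running average */
        int nrun = 2, i;

        for (i = 1; i <= 24; i++) {
                ldavg = (cexp0 * ldavg +
                    nrun * SK_FSCALE * (SK_FSCALE - cexp0)) >> SK_FSHIFT;
                if (i % 6 == 0)
                        printf("after %2d samples: %.2f\n",
                            i, (double)ldavg / SK_FSCALE);
        }
        return (0);
}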
This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ +void +schedclock(td) + struct thread *td; +{ + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + + if (td) { + ke->ke_cpticks++; + kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); + if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { + resetpriority(td->td_ksegrp); + if (td->td_priority >= PUSER) + td->td_priority = kg->kg_user_pri; + } + } else { + panic("schedclock"); + } +} + +/* + * General purpose yield system call + */ +int +yield(struct thread *td, struct yield_args *uap) +{ + struct ksegrp *kg = td->td_ksegrp; + + mtx_assert(&Giant, MA_NOTOWNED); + mtx_lock_spin(&sched_lock); + td->td_priority = PRI_MAX_TIMESHARE; + setrunqueue(td); + kg->kg_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + td->td_retval[0] = 0; + + return (0); +} + diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c new file mode 100644 index 0000000..2867bc9 --- /dev/null +++ b/sys/kern/kern_syscalls.c @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 1999 Assar Westerlund + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/module.h> + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. 
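/*
 * Sketch (not part of the committed code): how a loadable module would use
 * the syscall_register() logic shown below to claim one of the lkmnosys
 * placeholder slots.  The handler name, the sysent initializer and the
 * assumed two-field struct sysent layout are illustrative only; the real
 * definition is in <sys/sysent.h>.
 */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>

static int
example_syscall(struct thread *td, void *uap)
{
        td->td_retval[0] = 42;          /* value returned to userland */
        return (0);
}

static struct sysent example_sysent = {
        0,                              /* sy_narg: no arguments */
        (sy_call_t *)example_syscall    /* sy_call */
};

static struct sysent example_old_sysent;
static int example_offset = NO_SYSCALL; /* let the kernel pick a free slot */

static int
example_attach(void)
{
        /* On success, example_offset holds the allocated syscall number. */
        return (syscall_register(&example_offset, &example_sysent,
            &example_old_sysent));
}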
+ */ +int +lkmnosys(struct thread *td, struct nosys_args *args) +{ + return(nosys(td, args)); +} + +int +lkmressys(struct thread *td, struct nosys_args *args) +{ + return(nosys(td, args)); +} + +int +syscall_register(int *offset, struct sysent *new_sysent, + struct sysent *old_sysent) +{ + if (*offset == NO_SYSCALL) { + int i; + + for (i = 1; i < SYS_MAXSYSCALL; ++i) + if (sysent[i].sy_call == (sy_call_t *)lkmnosys) + break; + if (i == SYS_MAXSYSCALL) + return ENFILE; + *offset = i; + } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) + return EINVAL; + else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys && + sysent[*offset].sy_call != (sy_call_t *)lkmressys) + return EEXIST; + + *old_sysent = sysent[*offset]; + sysent[*offset] = *new_sysent; + return 0; +} + +int +syscall_deregister(int *offset, struct sysent *old_sysent) +{ + if (*offset) + sysent[*offset] = *old_sysent; + return 0; +} + +int +syscall_module_handler(struct module *mod, int what, void *arg) +{ + struct syscall_module_data *data = (struct syscall_module_data*)arg; + modspecific_t ms; + int error; + + switch (what) { + case MOD_LOAD : + error = syscall_register(data->offset, data->new_sysent, + &data->old_sysent); + if (error) + return error; + ms.intval = *data->offset; + MOD_XLOCK; + module_setspecific(mod, &ms); + MOD_XUNLOCK; + if (data->chainevh) + error = data->chainevh(mod, what, data->chainarg); + return error; + + case MOD_UNLOAD : + if (data->chainevh) { + error = data->chainevh(mod, what, data->chainarg); + if (error) + return error; + } + error = syscall_deregister(data->offset, &data->old_sysent); + return error; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 0000000..6943bc5 --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,1422 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/sysproto.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); +static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); + +/* + * Locking - this locks the sysctl tree in memory. + */ +static struct sx sysctllock; + +#define SYSCTL_LOCK() sx_xlock(&sysctllock) +#define SYSCTL_UNLOCK() sx_xunlock(&sysctllock) +#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl sysctllock") + +static int sysctl_root(SYSCTL_HANDLER_ARGS); + +struct sysctl_oid_list sysctl__children; /* root list */ + +static struct sysctl_oid * +sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) +{ + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, list, oid_link) { + if (strcmp(oidp->oid_name, name) == 0) { + return (oidp); + } + } + return (NULL); +} + +/* + * Initialization of the MIB tree. + * + * Order by number in each list. + */ + +void +sysctl_register_oid(struct sysctl_oid *oidp) +{ + struct sysctl_oid_list *parent = oidp->oid_parent; + struct sysctl_oid *p; + struct sysctl_oid *q; + + /* + * First check if another oid with the same name already + * exists in the parent's list. + */ + p = sysctl_find_oidname(oidp->oid_name, parent); + if (p != NULL) { + if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + p->oid_refcnt++; + return; + } else { + printf("can't re-use a leaf (%s)!\n", p->oid_name); + return; + } + } + /* + * If this oid has a number OID_AUTO, give it a number which + * is greater than any current oid. + * NOTE: DO NOT change the starting value here, change it in + * <sys/sysctl.h>, and make sure it is at least 256 to + * accomodate e.g. net.inet.raw as a static sysctl node. + */ + if (oidp->oid_number == OID_AUTO) { + static int newoid = CTL_AUTO_START; + + oidp->oid_number = newoid++; + if (newoid == 0x7fffffff) + panic("out of oids"); + } +#if 0 + else if (oidp->oid_number >= CTL_AUTO_START) { + /* do not panic; this happens when unregistering sysctl sets */ + printf("static sysctl oid too high: %d", oidp->oid_number); + } +#endif + + /* + * Insert the oid into the parent's list in order. + */ + q = NULL; + SLIST_FOREACH(p, parent, oid_link) { + if (oidp->oid_number < p->oid_number) + break; + q = p; + } + if (q) + SLIST_INSERT_AFTER(q, oidp, oid_link); + else + SLIST_INSERT_HEAD(parent, oidp, oid_link); +} + +void +sysctl_unregister_oid(struct sysctl_oid *oidp) +{ + SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); +} + +/* Initialize a new context to keep track of dynamically added sysctls. 
*/ +int +sysctl_ctx_init(struct sysctl_ctx_list *c) +{ + + if (c == NULL) { + return (EINVAL); + } + TAILQ_INIT(c); + return (0); +} + +/* Free the context, and destroy all dynamic oids registered in this context */ +int +sysctl_ctx_free(struct sysctl_ctx_list *clist) +{ + struct sysctl_ctx_entry *e, *e1; + int error; + + error = 0; + /* + * First perform a "dry run" to check if it's ok to remove oids. + * XXX FIXME + * XXX This algorithm is a hack. But I don't know any + * XXX better solution for now... + */ + TAILQ_FOREACH(e, clist, link) { + error = sysctl_remove_oid(e->entry, 0, 0); + if (error) + break; + } + /* + * Restore deregistered entries, either from the end, + * or from the place where error occured. + * e contains the entry that was not unregistered + */ + if (error) + e1 = TAILQ_PREV(e, sysctl_ctx_list, link); + else + e1 = TAILQ_LAST(clist, sysctl_ctx_list); + while (e1 != NULL) { + sysctl_register_oid(e1->entry); + e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); + } + if (error) + return(EBUSY); + /* Now really delete the entries */ + e = TAILQ_FIRST(clist); + while (e != NULL) { + e1 = TAILQ_NEXT(e, link); + error = sysctl_remove_oid(e->entry, 1, 0); + if (error) + panic("sysctl_remove_oid: corrupt tree, entry: %s", + e->entry->oid_name); + free(e, M_SYSCTLOID); + e = e1; + } + return (error); +} + +/* Add an entry to the context */ +struct sysctl_ctx_entry * +sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return(NULL); + e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); + e->entry = oidp; + TAILQ_INSERT_HEAD(clist, e, link); + return (e); +} + +/* Find an entry in the context */ +struct sysctl_ctx_entry * +sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return(NULL); + TAILQ_FOREACH(e, clist, link) { + if(e->entry == oidp) + return(e); + } + return (e); +} + +/* + * Delete an entry from the context. + * NOTE: this function doesn't free oidp! You have to remove it + * with sysctl_remove_oid(). + */ +int +sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) +{ + struct sysctl_ctx_entry *e; + + if (clist == NULL || oidp == NULL) + return (EINVAL); + e = sysctl_ctx_entry_find(clist, oidp); + if (e != NULL) { + TAILQ_REMOVE(clist, e, link); + free(e, M_SYSCTLOID); + return (0); + } else + return (ENOENT); +} + +/* + * Remove dynamically created sysctl trees. + * oidp - top of the tree to be removed + * del - if 0 - just deregister, otherwise free up entries as well + * recurse - if != 0 traverse the subtree to be deleted + */ +int +sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) +{ + struct sysctl_oid *p; + int error; + + if (oidp == NULL) + return(EINVAL); + if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { + printf("can't remove non-dynamic nodes!\n"); + return (EINVAL); + } + /* + * WARNING: normal method to do this should be through + * sysctl_ctx_free(). Use recursing as the last resort + * method to purge your sysctl tree of leftovers... + * However, if some other code still references these nodes, + * it will panic. 
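/*
 * Sketch (not part of the committed code): the intended life cycle of the
 * context API above together with sysctl_add_oid() defined later in this
 * file.  A driver creates its oids through a context so that a single
 * sysctl_ctx_free() call unwinds them all on detach.  The node and leaf
 * names are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static struct sysctl_ctx_list example_ctx;
static int example_value;

static void
example_attach(void)
{
        struct sysctl_oid *node;

        sysctl_ctx_init(&example_ctx);
        node = sysctl_add_oid(&example_ctx, &sysctl__children, OID_AUTO,
            "example", CTLTYPE_NODE | CTLFLAG_RW, 0, 0, 0, "N",
            "example subtree");
        if (node == NULL)
                return;
        sysctl_add_oid(&example_ctx, SYSCTL_CHILDREN(node), OID_AUTO,
            "value", CTLTYPE_INT | CTLFLAG_RW, &example_value, 0,
            sysctl_handle_int, "I", "an integer knob");
}

static void
example_detach(void)
{
        /* Deregisters and frees every oid added through example_ctx. */
        sysctl_ctx_free(&example_ctx);
}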
+ */ + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if (oidp->oid_refcnt == 1) { + SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { + if (!recurse) + return (ENOTEMPTY); + error = sysctl_remove_oid(p, del, recurse); + if (error) + return (error); + } + if (del) + free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID); + } + } + if (oidp->oid_refcnt > 1 ) { + oidp->oid_refcnt--; + } else { + if (oidp->oid_refcnt == 0) { + printf("Warning: bad oid_refcnt=%u (%s)!\n", + oidp->oid_refcnt, oidp->oid_name); + return (EINVAL); + } + sysctl_unregister_oid(oidp); + if (del) { + if (oidp->descr) + free(oidp->descr, M_SYSCTLOID); + free((void *)(uintptr_t)(const void *)oidp->oid_name, + M_SYSCTLOID); + free(oidp, M_SYSCTLOID); + } + } + return (0); +} + +/* + * Create new sysctls at run time. + * clist may point to a valid context initialized with sysctl_ctx_init(). + */ +struct sysctl_oid * +sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, + int number, const char *name, int kind, void *arg1, int arg2, + int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) +{ + struct sysctl_oid *oidp; + ssize_t len; + char *newname; + + /* You have to hook up somewhere.. */ + if (parent == NULL) + return(NULL); + /* Check if the node already exists, otherwise create it */ + oidp = sysctl_find_oidname(name, parent); + if (oidp != NULL) { + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + oidp->oid_refcnt++; + /* Update the context */ + if (clist != NULL) + sysctl_ctx_entry_add(clist, oidp); + return (oidp); + } else { + printf("can't re-use a leaf (%s)!\n", name); + return (NULL); + } + } + oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); + oidp->oid_parent = parent; + SLIST_NEXT(oidp, oid_link) = NULL; + oidp->oid_number = number; + oidp->oid_refcnt = 1; + len = strlen(name); + newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); + bcopy(name, newname, len + 1); + newname[len] = '\0'; + oidp->oid_name = newname; + oidp->oid_handler = handler; + oidp->oid_kind = CTLFLAG_DYN | kind; + if ((kind & CTLTYPE) == CTLTYPE_NODE) { + /* Allocate space for children */ + SYSCTL_CHILDREN(oidp) = malloc(sizeof(struct sysctl_oid_list), + M_SYSCTLOID, M_WAITOK); + SLIST_INIT(SYSCTL_CHILDREN(oidp)); + } else { + oidp->oid_arg1 = arg1; + oidp->oid_arg2 = arg2; + } + oidp->oid_fmt = fmt; + if (descr) { + int len = strlen(descr) + 1; + oidp->descr = malloc(len, M_SYSCTLOID, M_WAITOK); + if (oidp->descr) + strcpy(oidp->descr, descr); + } + /* Update the context, if used */ + if (clist != NULL) + sysctl_ctx_entry_add(clist, oidp); + /* Register this oid */ + sysctl_register_oid(oidp); + return (oidp); +} + +/* + * Register the kernel's oids on startup. + */ +SET_DECLARE(sysctl_set, struct sysctl_oid); + +static void +sysctl_register_all(void *arg) +{ + struct sysctl_oid **oidp; + + SYSCTL_INIT(); + SET_FOREACH(oidp, sysctl_set) + sysctl_register_oid(*oidp); +} +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0); + +/* + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." 
OID. + * {0,2,...} return the next OID. + * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. + * {0,5,...} return the description the "..." OID. + */ + +static void +sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) +{ + int k; + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, l, oid_link) { + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", oidp->oid_number, oidp->oid_name); + + printf("%c%c", + oidp->oid_kind & CTLFLAG_RD ? 'R':' ', + oidp->oid_kind & CTLFLAG_WR ? 'W':' '); + + if (oidp->oid_handler) + printf(" *Handler"); + + switch (oidp->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!oidp->oid_handler) { + sysctl_sysctl_debug_dump_node( + oidp->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + + } +} + +static int +sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = suser(req->td); + if (error) + return error; + sysctl_sysctl_debug_dump_node(&sysctl__children, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int error = 0; + struct sysctl_oid *oid; + struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; + char buf[10]; + + while (namelen) { + if (!lsp) { + snprintf(buf,sizeof(buf),"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + lsp2 = 0; + SLIST_FOREACH(oid, lsp, oid_link) { + if (oid->oid_number != *name) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, oid->oid_name, + strlen(oid->oid_name)); + if (error) + return (error); + + namelen--; + name++; + + if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if (oid->oid_handler) + break; + + lsp2 = (struct sysctl_oid_list *)oid->oid_arg1; + break; + } + lsp = lsp2; + } + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidpp) +{ + struct sysctl_oid *oidp; + + *len = level; + SLIST_FOREACH(oidp, lsp, oid_link) { + *next = oidp->oid_number; + *oidpp = oidp; + + if (!namelen) { + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if (oidp->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, + len, level+1, oidpp)) + return 0; + goto next; + } + + if (oidp->oid_number < *name) + continue; + + if (oidp->oid_number > *name) { + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if (oidp->oid_handler) + return 0; + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, + next+1, len, level+1, oidpp)) + return (0); + goto next; + } + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if (oidp->oid_handler) + continue; + + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, 
next+1, + len, level+1, oidpp)) + return (0); + next: + namelen = 1; + *len = level; + } + return 1; +} + +static int +sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct sysctl_oid_list *lsp = &sysctl__children; + int newoid[CTL_MAXNAME]; + + i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); + return (error); +} + +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) +{ + int i; + struct sysctl_oid *oidp; + struct sysctl_oid_list *lsp = &sysctl__children; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + oidp = SLIST_FIRST(lsp); + + while (oidp && *len < CTL_MAXNAME) { + if (strcmp(name, oidp->oid_name)) { + oidp = SLIST_NEXT(oidp, oid_link); + continue; + } + *oid++ = oidp->oid_number; + (*len)++; + + if (!i) { + if (oidpp) + *oidpp = oidp; + return (0); + } + + if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if (oidp->oid_handler) + break; + + lsp = (struct sysctl_oid_list *)oidp->oid_arg1; + oidp = SLIST_FIRST(lsp); + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) +{ + char *p; + int error, oid[CTL_MAXNAME], len; + struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ + return (ENAMETOOLONG); + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error; + + error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); + if (error) + return (error); + + if (!oid->oid_fmt) + return (ENOENT); + error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); + if (error) + return (error); + error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +static int +sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error; + + error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); + if (error) + return (error); + + if (!oid->descr) + return (ENOENT); + error = SYSCTL_OUT(req, oid->descr, strlen(oid->descr) + 1); + return (error); +} + +SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); + +/* + * Default "handler" functions. + */ + +/* + * Handle an int, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
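/*
 * Sketch (not part of the committed code): a private handler written
 * against the same pattern as the default handlers below and as
 * sysctl_kern_quantum() earlier in this commit -- export the current value,
 * and only parse and validate a new one when the request carries one.  The
 * variable and oid names are invented for the illustration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int example_percent = 50;

static int
sysctl_example_percent(SYSCTL_HANDLER_ARGS)
{
        int error, val;

        val = example_percent;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);         /* read-only access stops here */
        if (val < 0 || val > 100)
                return (EINVAL);        /* reject out-of-range writes */
        example_percent = val;
        return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, example_percent, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_example_percent, "I", "example bounded integer knob");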
+ */ + +int +sysctl_handle_int(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} + +/* + * Handle a long, signed or unsigned. arg1 points to it. + */ + +int +sysctl_handle_long(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + + if (!arg1) + return (EINVAL); + error = SYSCTL_OUT(req, arg1, sizeof(long)); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, sizeof(long)); + return (error); +} + +/* + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. + */ + +int +sysctl_handle_string(SYSCTL_HANDLER_ARGS) +{ + int error=0; + + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} + +/* + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. + */ + +int +sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} + +/* + * Transfer functions to/from kernel space. + * XXX: rather untested at this point + */ +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) +{ + size_t i = 0; + + if (req->oldptr) { + i = l; + if (req->oldlen <= req->oldidx) + i = 0; + else + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, + size_t *oldlenp, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.td = td; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (new != NULL) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + SYSCTL_LOCK(); + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen); + + SYSCTL_UNLOCK(); + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +int +kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, + void *new, size_t newlen, size_t *retval) +{ + int oid[CTL_MAXNAME]; + size_t oidlen, plen; + int error; + + oid[0] = 0; /* sysctl internal magic */ + oid[1] = 3; /* name2oid */ + oidlen = sizeof(oid); + + error = kernel_sysctl(td, oid, 2, oid, &oidlen, + (void *)name, strlen(name), &plen); + if (error) + return (error); + + error = 
kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, + new, newlen, retval); + return (error); +} + +/* + * Transfer function to/from user space. + */ +static int +sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) +{ + int error = 0; + size_t i = 0; + + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = l; + if (req->oldlen <= req->oldidx) + i = 0; + else + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, size_t l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (error); +} + +int +sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, + int *nindx, struct sysctl_req *req) +{ + struct sysctl_oid *oid; + int indx; + + oid = SLIST_FIRST(&sysctl__children); + indx = 0; + while (oid && indx < CTL_MAXNAME) { + if (oid->oid_number == name[indx]) { + indx++; + if (oid->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if (oid->oid_handler != NULL || + indx == namelen) { + *noid = oid; + if (nindx != NULL) + *nindx = indx; + return (0); + } + oid = SLIST_FIRST( + (struct sysctl_oid_list *)oid->oid_arg1); + } else if (indx == namelen) { + *noid = oid; + if (nindx != NULL) + *nindx = indx; + return (0); + } else { + return (ENOTDIR); + } + } else { + oid = SLIST_NEXT(oid, oid_link); + } + } + return (ENOENT); +} + +/* + * Traverse our tree, and find the right node, execute whatever it points + * to, and return the resulting error code. + */ + +int +sysctl_root(SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + int error, indx; + + error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); + if (error) + return (error); + + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + /* + * You can't call a sysctl when it's a node, but has + * no handler. Inform the user that it's a node. + * The indx may or may not be the same as namelen. + */ + if (oid->oid_handler == NULL) + return (EISDIR); + } + + /* Is this sysctl writable? */ + if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) + return (EPERM); + + KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); + + /* Is this sysctl sensitive to securelevels? */ + if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { + error = securelevel_gt(req->td->td_ucred, 0); + if (error) + return (error); + } + + /* Is this sysctl writable by only privileged users? 
*/ + if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { + int flags; + + if (oid->oid_kind & CTLFLAG_PRISON) + flags = PRISON_ROOT; + else + flags = 0; + error = suser_cred(req->td->td_ucred, flags); + if (error) + return (error); + } + + if (!oid->oid_handler) + return EINVAL; + + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) + error = oid->oid_handler(oid, (int *)arg1 + indx, arg2 - indx, + req); + else + error = oid->oid_handler(oid, oid->oid_arg1, oid->oid_arg2, + req); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif + +/* + * MPSAFE + */ +int +__sysctl(struct thread *td, struct sysctl_args *uap) +{ + int error, name[CTL_MAXNAME]; + size_t j; + + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) + return (EINVAL); + + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); + + mtx_lock(&Giant); + + error = userland_sysctl(td, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + goto done2; + if (uap->oldlenp) { + int i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + error = i; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. + */ +int +userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, + size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req, req2; + + bzero(&req, sizeof req); + + req.td = td; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) + return (error); + } + } + + if (old) { + if (!useracc(old, req.oldlen, VM_PROT_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if (new != NULL) { + if (!useracc(new, req.newlen, VM_PROT_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + SYSCTL_LOCK(); + + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); + + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen); + + SYSCTL_UNLOCK(); + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +#ifdef COMPAT_43 +#include <sys/socket.h> +#include <vm/vm_param.h> + +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) + +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... 
-Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ + + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! */ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + size_t *size; + int arg; +}; +#endif + +/* + * MPSAFE + */ +int +ogetkerninfo(struct thread *td, struct getkerninfo_args *uap) +{ + int error, name[6]; + size_t size; + u_int needed = 0; + + mtx_lock(&Giant); + + switch (uap->op & 0xff00) { + + case KINFO_RT: + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(td, name, 6, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_VNODE: + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_PROC: + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(td, name, 4, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_FILE: + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_METER: + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_LOADAVG: + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_CLOCKRATE: + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(td, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * *size gives the size of the buffer before the call, and + * the amount of data copied after a successful call. + * If successful, the return value is the amount of data + * available, which can be larger than *size. + * + * BSDI's 2.x product apparently fails with ENOMEM if *size + * is too small. 
+ */ + + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if ((uap->where == NULL) || (uap->size == NULL)) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + if ((error = copyin(uap->size, &size, sizeof(size))) != 0) + break; + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? */ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + + default: + error = EOPNOTSUPP; + break; + } + if (error == 0) { + td->td_retval[0] = needed ? needed : size; + if (uap->size) { + error = copyout((caddr_t)&size, (caddr_t)uap->size, + sizeof(size)); + } + } + mtx_unlock(&Giant); + return (error); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..fabc204 --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,684 @@ +/*- + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/timepps.h> +#include <sys/timetc.h> +#include <sys/timex.h> + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * time services. + */ + +static u_int +dummy_get_timecount(struct timecounter *tc) +{ + static u_int now; + + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, 0, ~0u, 1000000, "dummy", +}; + +struct timehands { + /* These fields must be initialized by the driver. */ + struct timecounter *th_counter; + int64_t th_adjustment; + u_int64_t th_scale; + u_int th_offset_count; + struct bintime th_offset; + struct timeval th_microtime; + struct timespec th_nanotime; + /* Fields not to be copied in tc_windup start with th_generation. 
*/ + volatile u_int th_generation; + struct timehands *th_next; +}; + +extern struct timehands th0; +static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; +static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; +static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; +static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; +static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; +static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; +static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; +static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; +static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; +static struct timehands th0 = { + &dummy_timecounter, + 0, + (uint64_t)-1 / 1000000, + 0, + {1, 0}, + {0, 0}, + {0, 0}, + 1, + &th1 +}; + +static struct timehands *volatile timehands = &th0; +struct timecounter *timecounter = &dummy_timecounter; +static struct timecounter *timecounters = &dummy_timecounter; + +time_t time_second = 1; + +static struct bintime boottimebin; +struct timeval boottime; +SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD, + &boottime, timeval, "System boottime"); + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +#define TC_STATS(foo) \ + static u_int foo; \ + SYSCTL_UINT(_kern_timecounter, OID_AUTO, foo, CTLFLAG_RD, &foo, 0, "") \ + struct __hack + +TC_STATS(nbinuptime); TC_STATS(nnanouptime); TC_STATS(nmicrouptime); +TC_STATS(nbintime); TC_STATS(nnanotime); TC_STATS(nmicrotime); +TC_STATS(ngetbinuptime); TC_STATS(ngetnanouptime); TC_STATS(ngetmicrouptime); +TC_STATS(ngetbintime); TC_STATS(ngetnanotime); TC_STATS(ngetmicrotime); + +#undef TC_STATS + +static void tc_windup(void); + +/* + * Return the difference between the timehands' counter value now and what + * was when we copied it to the timehands' offset_count. + */ +static __inline u_int +tc_delta(struct timehands *th) +{ + struct timecounter *tc; + + tc = th->th_counter; + return ((tc->tc_get_timecount(tc) - th->th_offset_count) & + tc->tc_counter_mask); +} + +/* + * Functions for reading the time. We have to loop until we are sure that + * the timehands that we operated on was not updated under our feet. See + * the comment in <sys/time.h> for a description of these 12 functions. 
+ */ + +void +binuptime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + nbinuptime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + bintime_addx(bt, th->th_scale * tc_delta(th)); + } while (gen == 0 || gen != th->th_generation); +} + +void +nanouptime(struct timespec *tsp) +{ + struct bintime bt; + + nnanouptime++; + binuptime(&bt); + bintime2timespec(&bt, tsp); +} + +void +microuptime(struct timeval *tvp) +{ + struct bintime bt; + + nmicrouptime++; + binuptime(&bt); + bintime2timeval(&bt, tvp); +} + +void +bintime(struct bintime *bt) +{ + + nbintime++; + binuptime(bt); + bintime_add(bt, &boottimebin); +} + +void +nanotime(struct timespec *tsp) +{ + struct bintime bt; + + nnanotime++; + bintime(&bt); + bintime2timespec(&bt, tsp); +} + +void +microtime(struct timeval *tvp) +{ + struct bintime bt; + + nmicrotime++; + bintime(&bt); + bintime2timeval(&bt, tvp); +} + +void +getbinuptime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + ngetbinuptime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + } while (gen == 0 || gen != th->th_generation); +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timehands *th; + u_int gen; + + ngetnanouptime++; + do { + th = timehands; + gen = th->th_generation; + bintime2timespec(&th->th_offset, tsp); + } while (gen == 0 || gen != th->th_generation); +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timehands *th; + u_int gen; + + ngetmicrouptime++; + do { + th = timehands; + gen = th->th_generation; + bintime2timeval(&th->th_offset, tvp); + } while (gen == 0 || gen != th->th_generation); +} + +void +getbintime(struct bintime *bt) +{ + struct timehands *th; + u_int gen; + + ngetbintime++; + do { + th = timehands; + gen = th->th_generation; + *bt = th->th_offset; + } while (gen == 0 || gen != th->th_generation); + bintime_add(bt, &boottimebin); +} + +void +getnanotime(struct timespec *tsp) +{ + struct timehands *th; + u_int gen; + + ngetnanotime++; + do { + th = timehands; + gen = th->th_generation; + *tsp = th->th_nanotime; + } while (gen == 0 || gen != th->th_generation); +} + +void +getmicrotime(struct timeval *tvp) +{ + struct timehands *th; + u_int gen; + + ngetmicrotime++; + do { + th = timehands; + gen = th->th_generation; + *tvp = th->th_microtime; + } while (gen == 0 || gen != th->th_generation); +} + +/* + * Initialize a new timecounter. + * We should really try to rank the timecounters and intelligently determine + * if the new timecounter is better than the current one. This is subject + * to further study. For now always use the new timecounter. + */ +void +tc_init(struct timecounter *tc) +{ + + tc->tc_next = timecounters; + timecounters = tc; + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + (void)tc->tc_get_timecount(tc); + (void)tc->tc_get_timecount(tc); + timecounter = tc; +} + +/* Report the frequency of the current timecounter. */ +u_int32_t +tc_getfrequency(void) +{ + + return (timehands->th_counter->tc_frequency); +} + +/* + * Step our concept of GMT. This is done by modifying our estimate of + * when we booted. XXX: needs futher work. + */ +void +tc_setclock(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + /* XXX boottime should probably be a timespec. 
*/ + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + timeval2bintime(&boottime, &boottimebin); + + /* XXX fiddle all the little crinkly bits around the fiords... */ + tc_windup(); +} + +/* + * Initialize the next struct timehands in the ring and make + * it the active timehands. Along the way we might switch to a different + * timecounter and/or do seconds processing in NTP. Slightly magic. + */ +static void +tc_windup(void) +{ + struct bintime bt; + struct timehands *th, *tho; + u_int64_t scale; + u_int delta, ncount, ogen; + int i; + + /* + * Make the next timehands a copy of the current one, but do not + * overwrite the generation or next pointer. While we update + * the contents, the generation must be zero. + */ + tho = timehands; + th = tho->th_next; + ogen = th->th_generation; + th->th_generation = 0; + bcopy(tho, th, offsetof(struct timehands, th_generation)); + + /* + * Capture a timecounter delta on the current timecounter and if + * changing timecounters, a counter value from the new timecounter. + * Update the offset fields accordingly. + */ + delta = tc_delta(th); + if (th->th_counter != timecounter) + ncount = timecounter->tc_get_timecount(timecounter); + else + ncount = 0; + th->th_offset_count += delta; + th->th_offset_count &= th->th_counter->tc_counter_mask; + bintime_addx(&th->th_offset, th->th_scale * delta); + + /* + * Hardware latching timecounters may not generate interrupts on + * PPS events, so instead we poll them. There is a finite risk that + * the hardware might capture a count which is later than the one we + * got above, and therefore possibly in the next NTP second which might + * have a different rate than the current NTP second. It doesn't + * matter in practice. + */ + if (tho->th_counter->tc_poll_pps) + tho->th_counter->tc_poll_pps(tho->th_counter); + + /* + * Deal with NTP second processing. The for loop normally only + * iterates once, but in extreme situations it might keep NTP sane + * if timeouts are not run for several seconds. + */ + for (i = th->th_offset.sec - tho->th_offset.sec; i > 0; i--) + ntp_update_second(&th->th_adjustment, &th->th_offset.sec); + + /* Now is a good time to change timecounters. */ + if (th->th_counter != timecounter) { + th->th_counter = timecounter; + th->th_offset_count = ncount; + } + + /*- + * Recalculate the scaling factor. We want the number of 1/2^64 + * fractions of a second per period of the hardware counter, taking + * into account the th_adjustment factor which the NTP PLL/adjtime(2) + * processing provides us with. + * + * The th_adjustment is nanoseconds per second with 32 bit binary + * fraction and want 64 bit binary fraction of second: + * + * x = a * 2^32 / 10^9 = a * 4.294967296 + * + * The range of th_adjustment is +/- 5000PPM so inside a 64bit int + * we can only multiply by about 850 without overflowing, but that + * leaves suitably precise fractions for multiply before divide. + * + * Divide before multiply with a fraction of 2199/512 results in a + * systematic undercompensation of 10PPM of th_adjustment. On a + * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. + * + * We happily sacrifice the lowest of the 64 bits of our result + * to the goddess of code clarity. + * + */ + scale = (u_int64_t)1 << 63; + scale += (th->th_adjustment / 1024) * 2199; + scale /= th->th_counter->tc_frequency; + th->th_scale = scale * 2; + + /* Update the GMT timestamps used for the get*() functions. 
*/ + bt = th->th_offset; + bintime_add(&bt, &boottimebin); + bintime2timeval(&bt, &th->th_microtime); + bintime2timespec(&bt, &th->th_nanotime); + + /* + * Now that the struct timehands is again consistent, set the new + * generation number, making sure to not make it zero. + */ + if (++ogen == 0) + ogen = 1; + th->th_generation = ogen; + + /* Go live with the new struct timehands. */ + time_second = th->th_microtime.tv_sec; + timehands = th; +} + +/* Report or change the active timecounter hardware. */ +static int +sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) +{ + char newname[32]; + struct timecounter *newtc, *tc; + int error; + + tc = timecounter; + strncpy(newname, tc->tc_name, sizeof(newname)); + newname[sizeof(newname) - 1] = '\0'; + error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); + if (error != 0 || req->newptr == NULL || + strcmp(newname, tc->tc_name) == 0) + return (error); + for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { + if (strcmp(newname, newtc->tc_name) != 0) + continue; + + /* Warm up new timecounter. */ + (void)newtc->tc_get_timecount(newtc); + (void)newtc->tc_get_timecount(newtc); + + timecounter = newtc; + return (0); + } + return (EINVAL); +} + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW, + 0, 0, sysctl_kern_timecounter_hardware, "A", ""); + +/* + * RFC 2783 PPS-API implementation. + */ + +int +pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) +{ + pps_params_t *app; + struct pps_fetch_args *fapi; +#ifdef PPS_SYNC + struct pps_kcbind_args *kapi; +#endif + + switch (cmd) { + case PPS_IOC_CREATE: + return (0); + case PPS_IOC_DESTROY: + return (0); + case PPS_IOC_SETPARAMS: + app = (pps_params_t *)data; + if (app->mode & ~pps->ppscap) + return (EINVAL); + pps->ppsparam = *app; + return (0); + case PPS_IOC_GETPARAMS: + app = (pps_params_t *)data; + *app = pps->ppsparam; + app->api_version = PPS_API_VERS_1; + return (0); + case PPS_IOC_GETCAP: + *(int*)data = pps->ppscap; + return (0); + case PPS_IOC_FETCH: + fapi = (struct pps_fetch_args *)data; + if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) + return (EINVAL); + if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) + return (EOPNOTSUPP); + pps->ppsinfo.current_mode = pps->ppsparam.mode; + fapi->pps_info_buf = pps->ppsinfo; + return (0); + case PPS_IOC_KCBIND: +#ifdef PPS_SYNC + kapi = (struct pps_kcbind_args *)data; + /* XXX Only root should be able to do this */ + if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) + return (EINVAL); + if (kapi->kernel_consumer != PPS_KC_HARDPPS) + return (EINVAL); + if (kapi->edge & ~pps->ppscap) + return (EINVAL); + pps->kcmode = kapi->edge; + return (0); +#else + return (EOPNOTSUPP); +#endif + default: + return (ENOTTY); + } +} + +void +pps_init(struct pps_state *pps) +{ + pps->ppscap |= PPS_TSFMT_TSPEC; + if (pps->ppscap & PPS_CAPTUREASSERT) + pps->ppscap |= PPS_OFFSETASSERT; + if (pps->ppscap & PPS_CAPTURECLEAR) + pps->ppscap |= PPS_OFFSETCLEAR; +} + +void +pps_capture(struct pps_state *pps) +{ + struct timehands *th; + + th = timehands; + pps->capgen = th->th_generation; + pps->capth = th; + pps->capcount = th->th_counter->tc_get_timecount(th->th_counter); + if (pps->capgen != th->th_generation) + pps->capgen = 0; +} + +void +pps_event(struct pps_state *pps, int event) +{ + struct bintime bt; + struct timespec ts, *tsp, *osp; + u_int tcount, *pcount; + int foff, fhard; + pps_seq_t *pseq; + + /* If the timecounter was wound up underneath us, bail out. 
*/ + if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation) + return; + + /* Things would be easier with arrays. */ + if (event == PPS_CAPTUREASSERT) { + tsp = &pps->ppsinfo.assert_timestamp; + osp = &pps->ppsparam.assert_offset; + foff = pps->ppsparam.mode & PPS_OFFSETASSERT; + fhard = pps->kcmode & PPS_CAPTUREASSERT; + pcount = &pps->ppscount[0]; + pseq = &pps->ppsinfo.assert_sequence; + } else { + tsp = &pps->ppsinfo.clear_timestamp; + osp = &pps->ppsparam.clear_offset; + foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; + fhard = pps->kcmode & PPS_CAPTURECLEAR; + pcount = &pps->ppscount[1]; + pseq = &pps->ppsinfo.clear_sequence; + } + + /* + * If the timecounter changed, we cannot compare the count values, so + * we have to drop the rest of the PPS-stuff until the next event. + */ + if (pps->ppstc != pps->capth->th_counter) { + pps->ppstc = pps->capth->th_counter; + *pcount = pps->capcount; + pps->ppscount[2] = pps->capcount; + return; + } + + /* Return if nothing really happened. */ + if (*pcount == pps->capcount) + return; + + /* Convert the count to a timespec. */ + tcount = pps->capcount - pps->capth->th_offset_count; + tcount &= pps->capth->th_counter->tc_counter_mask; + bt = pps->capth->th_offset; + bintime_addx(&bt, pps->capth->th_scale * tcount); + bintime_add(&bt, &boottimebin); + bintime2timespec(&bt, &ts); + + /* If the timecounter was wound up underneath us, bail out. */ + if (pps->capgen != pps->capth->th_generation) + return; + + *pcount = pps->capcount; + (*pseq)++; + *tsp = ts; + + if (foff) { + timespecadd(tsp, osp); + if (tsp->tv_nsec < 0) { + tsp->tv_nsec += 1000000000; + tsp->tv_sec -= 1; + } + } +#ifdef PPS_SYNC + if (fhard) { + /* + * Feed the NTP PLL/FLL. + * The FLL wants to know how many nanoseconds elapsed since + * the previous event. + * I have never been able to convince myself that this code + * is actually correct: Using th_scale is bound to contain + * a phase correction component from userland, when running + * as FLL, so the number hardpps() gets is not meaningful IMO. + */ + tcount = pps->capcount - pps->ppscount[2]; + pps->ppscount[2] = pps->capcount; + tcount &= pps->capth->th_counter->tc_counter_mask; + bt.sec = 0; + bt.frac = 0; + bintime_addx(&bt, pps->capth->th_scale * tcount); + bintime2timespec(&bt, &ts); + hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec); + } +#endif +} + +/* + * Timecounters need to be updated every so often to prevent the hardware + * counter from overflowing. Updating also recalculates the cached values + * used by the get*() family of functions, so their precision depends on + * the update frequency. + */ + +static int tc_tick; +SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tick, 0, ""); + +static void +tc_ticktock(void *dummy) +{ + + tc_windup(); + timeout(tc_ticktock, NULL, tc_tick); +} + +static void +inittimecounter(void *dummy) +{ + u_int p; + + /* + * Set the initial timeout to + * max(1, <approx. number of hardclock ticks in a millisecond>). + * People should probably not use the sysctl to set the timeout + * to smaller than its inital value, since that value is the + * smallest reasonable one. If they want better timestamps they + * should use the non-"get"* functions. + */ + if (hz > 1000) + tc_tick = (hz + 500) / 1000; + else + tc_tick = 1; + p = (tc_tick * 1000000) / hz; + printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); + + /* warm up new timecounter (again) and get rolling. 
*/ + (void)timecounter->tc_get_timecount(timecounter); + (void)timecounter->tc_get_timecount(timecounter); + tc_ticktock(NULL); +} + +SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_FIRST, inittimecounter, NULL) diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 0000000..645170e --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,678 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timetc.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +struct timezone tz; + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. 
+ */ + +static int nanosleep1(struct thread *td, struct timespec *rqt, + struct timespec *rmt); +static int settime(struct thread *, struct timeval *); +static void timevalfix(struct timeval *); +static void no_lease_updatetime(int); + +static void +no_lease_updatetime(deltat) + int deltat; +{ +} + +void (*lease_updatetime)(int) = no_lease_updatetime; + +static int +settime(td, tv) + struct thread *td; + struct timeval *tv; +{ + struct timeval delta, tv1, tv2; + static struct timeval maxtime, laststep; + struct timespec ts; + int s; + + s = splclock(); + microtime(&tv1); + delta = *tv; + timevalsub(&delta, &tv1); + + /* + * If the system is secure, we do not allow the time to be + * set to a value earlier than 1 second less than the highest + * time we have yet seen. The worst a miscreant can do in + * this circumstance is "freeze" time. He couldn't go + * back to the past. + * + * We similarly do not allow the clock to be stepped more + * than one second, nor more than once per second. This allows + * a miscreant to make the clock march double-time, but no worse. + */ + if (securelevel_gt(td->td_ucred, 1) != 0) { + if (delta.tv_sec < 0 || delta.tv_usec < 0) { + /* + * Update maxtime to latest time we've seen. + */ + if (tv1.tv_sec > maxtime.tv_sec) + maxtime = tv1; + tv2 = *tv; + timevalsub(&tv2, &maxtime); + if (tv2.tv_sec < -1) { + tv->tv_sec = maxtime.tv_sec - 1; + printf("Time adjustment clamped to -1 second\n"); + } + } else { + if (tv1.tv_sec == laststep.tv_sec) { + splx(s); + return (EPERM); + } + if (delta.tv_sec > 1) { + tv->tv_sec = tv1.tv_sec + 1; + printf("Time adjustment clamped to +1 second\n"); + } + laststep = *tv; + } + } + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = tv->tv_usec * 1000; + mtx_lock(&Giant); + tc_setclock(&ts); + (void) splsoftclock(); + lease_updatetime(delta.tv_sec); + splx(s); + resettodr(); + mtx_unlock(&Giant); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_gettime_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +clock_gettime(td, uap) + struct thread *td; + struct clock_gettime_args *uap; +{ + struct timespec ats; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + mtx_lock(&Giant); + nanotime(&ats); + mtx_unlock(&Giant); + return (copyout(&ats, SCARG(uap, tp), sizeof(ats))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_settime_args { + clockid_t clock_id; + const struct timespec *tp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +clock_settime(td, uap) + struct thread *td; + struct clock_settime_args *uap; +{ + struct timeval atv; + struct timespec ats; + int error; + + if ((error = suser(td)) != 0) + return (error); + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) + return (error); + if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000) + return (EINVAL); + /* XXX Don't convert nsec->usec and back */ + TIMESPEC_TO_TIMEVAL(&atv, &ats); + error = settime(td, &atv); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_getres_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +int +clock_getres(td, uap) + struct thread *td; + struct clock_getres_args *uap; +{ + struct timespec ts; + int error; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + error = 0; + if (SCARG(uap, tp)) { + ts.tv_sec = 0; + ts.tv_nsec = 1000000000 / tc_getfrequency(); + error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); + } + return (error); +} + +static int nanowait; + 
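The syscall entry points above and nanosleep1() below are the kernel side of the standard POSIX time interfaces. As a rough illustration only (not part of the committed source; a userland sketch assuming a hosted C environment), this is how those interfaces are exercised from a program:

#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct timespec res, now;
	struct timespec delay = { 0, 250000000 };	/* 250 ms */

	/* clock_getres() reports 1 s / tc_getfrequency() as the resolution. */
	if (clock_getres(CLOCK_REALTIME, &res) == 0)
		printf("resolution: %ld ns\n", res.tv_nsec);

	/* clock_gettime(CLOCK_REALTIME, ...) is serviced by nanotime(). */
	if (clock_gettime(CLOCK_REALTIME, &now) == 0)
		printf("now: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);

	/*
	 * nanosleep() ends up in nanosleep1(); had the second argument been
	 * non-NULL, the unslept remainder would be copied out on EINTR.
	 */
	nanosleep(&delay, NULL);
	return (0);
}
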
+static int +nanosleep1(td, rqt, rmt) + struct thread *td; + struct timespec *rqt, *rmt; +{ + struct timespec ts, ts2, ts3; + struct timeval tv; + int error; + + if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) + return (EINVAL); + if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) + return (0); + getnanouptime(&ts); + timespecadd(&ts, rqt); + TIMESPEC_TO_TIMEVAL(&tv, rqt); + for (;;) { + error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", + tvtohz(&tv)); + getnanouptime(&ts2); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + timespecsub(&ts, &ts2); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; + } + return (error); + } + if (timespeccmp(&ts2, &ts, >=)) + return (0); + ts3 = ts; + timespecsub(&ts3, &ts2); + TIMESPEC_TO_TIMEVAL(&tv, &ts3); + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct nanosleep_args { + struct timespec *rqtp; + struct timespec *rmtp; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +nanosleep(td, uap) + struct thread *td; + struct nanosleep_args *uap; +{ + struct timespec rmt, rqt; + int error; + + error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt)); + if (error) + return (error); + + mtx_lock(&Giant); + if (SCARG(uap, rmtp)) { + if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt), + VM_PROT_WRITE)) { + error = EFAULT; + goto done2; + } + } + error = nanosleep1(td, &rqt, &rmt); + if (error && SCARG(uap, rmtp)) { + int error2; + + error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); + if (error2) /* XXX shouldn't happen, did useracc() above */ + error = error2; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +gettimeofday(td, uap) + struct thread *td; + register struct gettimeofday_args *uap; +{ + struct timeval atv; + int error = 0; + + if (uap->tp) { + microtime(&atv); + error = copyout((caddr_t)&atv, (caddr_t)uap->tp, sizeof (atv)); + } + if (error == 0 && uap->tzp != NULL) { + mtx_lock(&Giant); + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, + sizeof (tz)); + mtx_unlock(&Giant); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +settimeofday(td, uap) + struct thread *td; + struct settimeofday_args *uap; +{ + struct timeval atv; + struct timezone atz; + int error = 0; + + if ((error = suser(td))) + return (error); + /* Verify all parameters before changing time. */ + if (uap->tv) { + if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof(atv)))) + return (error); + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + } + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) + return (error); + + if (uap->tv && (error = settime(td, &atv))) + return (error); + if (uap->tzp) { + mtx_lock(&Giant); + tz = atz; + mtx_unlock(&Giant); + } + return (error); +} +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. + * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. 
+ * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getitimer(td, uap) + struct thread *td; + register struct getitimer_args *uap; +{ + struct proc *p = td->td_proc; + struct timeval ctv; + struct itimerval aitv; + int s; + int error; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + + mtx_lock(&Giant); + + s = splclock(); /* XXX still needed ? */ + if (uap->which == ITIMER_REAL) { + /* + * Convert from absolute to relative time in .it_value + * part of real time timer. If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. + */ + aitv = p->p_realtimer; + if (timevalisset(&aitv.it_value)) { + getmicrouptime(&ctv); + if (timevalcmp(&aitv.it_value, &ctv, <)) + timevalclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, &ctv); + } + } else { + aitv = p->p_stats->p_timer[uap->which]; + } + splx(s); + error = copyout((caddr_t)&aitv, (caddr_t)uap->itv, + sizeof (struct itimerval)); + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setitimer(td, uap) + struct thread *td; + register struct setitimer_args *uap; +{ + struct proc *p = td->td_proc; + struct itimerval aitv; + struct timeval ctv; + register struct itimerval *itvp; + int s, error = 0; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + itvp = uap->itv; + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + + mtx_lock(&Giant); + + if ((uap->itv = uap->oitv) && + (error = getitimer(td, (struct getitimer_args *)uap))) { + goto done2; + } + if (itvp == 0) { + error = 0; + goto done2; + } + if (itimerfix(&aitv.it_value)) { + error = EINVAL; + goto done2; + } + if (!timevalisset(&aitv.it_value)) { + timevalclear(&aitv.it_interval); + } else if (itimerfix(&aitv.it_interval)) { + error = EINVAL; + goto done2; + } + s = splclock(); /* XXX: still needed ? */ + if (uap->which == ITIMER_REAL) { + if (timevalisset(&p->p_realtimer.it_value)) + callout_stop(&p->p_itcallout); + if (timevalisset(&aitv.it_value)) + callout_reset(&p->p_itcallout, tvtohz(&aitv.it_value), + realitexpire, p); + getmicrouptime(&ctv); + timevaladd(&aitv.it_value, &ctv); + p->p_realtimer = aitv; + } else { + p->p_stats->p_timer[uap->which] = aitv; + } + splx(s); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. 
+ * tvtohz() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. + */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + struct timeval ctv, ntv; + int s; + + p = (struct proc *)arg; + PROC_LOCK(p); + psignal(p, SIGALRM); + if (!timevalisset(&p->p_realtimer.it_interval)) { + timevalclear(&p->p_realtimer.it_value); + PROC_UNLOCK(p); + return; + } + for (;;) { + s = splclock(); /* XXX: still neeeded ? */ + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + getmicrouptime(&ctv); + if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { + ntv = p->p_realtimer.it_value; + timevalsub(&ntv, &ctv); + callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1, + realitexpire, p); + splx(s); + PROC_UNLOCK(p); + return; + } + splx(s); + } + /*NOTREACHED*/ +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +int +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. + */ +int +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timevalisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timevalisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. 
+ */ +void +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +void +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..937b0c2 --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,414 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* Exported to machdep.c and/or kern_clock.c. */ +struct callout *callout; +struct callout_list callfree; +int callwheelsize, callwheelbits, callwheelmask; +struct callout_tailq *callwheel; +int softticks; /* Like ticks, but for softclock(). */ +struct mtx callout_lock; + +static struct callout *nextsoftcheck; /* Next callout to be checked. 
*/ + +/* + * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization + * + * This code is called very early in the kernel initialization sequence, + * and may be called more then once. + */ +caddr_t +kern_timeout_callwheel_alloc(caddr_t v) +{ + /* + * Calculate callout wheel size + */ + for (callwheelsize = 1, callwheelbits = 0; + callwheelsize < ncallout; + callwheelsize <<= 1, ++callwheelbits) + ; + callwheelmask = callwheelsize - 1; + + callout = (struct callout *)v; + v = (caddr_t)(callout + ncallout); + callwheel = (struct callout_tailq *)v; + v = (caddr_t)(callwheel + callwheelsize); + return(v); +} + +/* + * kern_timeout_callwheel_init() - initialize previously reserved callwheel + * space. + * + * This code is called just once, after the space reserved for the + * callout wheel has been finalized. + */ +void +kern_timeout_callwheel_init(void) +{ + int i; + + SLIST_INIT(&callfree); + for (i = 0; i < ncallout; i++) { + callout_init(&callout[i], 0); + callout[i].c_flags = CALLOUT_LOCAL_ALLOC; + SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); + } + for (i = 0; i < callwheelsize; i++) { + TAILQ_INIT(&callwheel[i]); + } + mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); +} + +/* + * The callout mechanism is based on the work of Adam M. Costello and + * George Varghese, published in a technical report entitled "Redesigning + * the BSD Callout and Timer Facilities" and modified slightly for inclusion + * in FreeBSD by Justin T. Gibbs. The original work on the data structures + * used in this implementation was published by G.Varghese and A. Lauck in + * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for + * the Efficient Implementation of a Timer Facility" in the Proceedings of + * the 11th ACM Annual Symposium on Operating Systems Principles, + * Austin, Texas Nov 1987. + */ + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +void +softclock(void *dummy) +{ + register struct callout *c; + register struct callout_tailq *bucket; + register int curticks; + register int steps; /* #steps since we last allowed interrupts */ + +#ifndef MAX_SOFTCLOCK_STEPS +#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ +#endif /* MAX_SOFTCLOCK_STEPS */ + + steps = 0; + mtx_lock_spin(&callout_lock); + while (softticks != ticks) { + softticks++; + /* + * softticks may be modified by hard clock, so cache + * it while we work on a given bucket. + */ + curticks = softticks; + bucket = &callwheel[curticks & callwheelmask]; + c = TAILQ_FIRST(bucket); + while (c) { + if (c->c_time != curticks) { + c = TAILQ_NEXT(c, c_links.tqe); + ++steps; + if (steps >= MAX_SOFTCLOCK_STEPS) { + nextsoftcheck = c; + /* Give interrupts a chance. 
*/ + mtx_unlock_spin(&callout_lock); + ; /* nothing */ + mtx_lock_spin(&callout_lock); + c = nextsoftcheck; + steps = 0; + } + } else { + void (*c_func)(void *); + void *c_arg; + int c_flags; + + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(bucket, c, c_links.tqe); + c_func = c->c_func; + c_arg = c->c_arg; + c_flags = c->c_flags; + c->c_func = NULL; + if (c->c_flags & CALLOUT_LOCAL_ALLOC) { + c->c_flags = CALLOUT_LOCAL_ALLOC; + SLIST_INSERT_HEAD(&callfree, c, + c_links.sle); + } else { + c->c_flags = + (c->c_flags & ~CALLOUT_PENDING); + } + mtx_unlock_spin(&callout_lock); + if (!(c_flags & CALLOUT_MPSAFE)) + mtx_lock(&Giant); + c_func(c_arg); + if (!(c_flags & CALLOUT_MPSAFE)) + mtx_unlock(&Giant); + mtx_lock_spin(&callout_lock); + steps = 0; + c = nextsoftcheck; + } + } + } + nextsoftcheck = NULL; + mtx_unlock_spin(&callout_lock); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * callout_handle_init -- + * Initialize a handle so that using it with untimeout is benign. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that although an + * identification value is returned from timeout, the original + * arguments to timeout as well as the identifier are used to + * identify entries for untimeout. + */ +struct callout_handle +timeout(ftn, arg, to_ticks) + timeout_t *ftn; + void *arg; + int to_ticks; +{ + struct callout *new; + struct callout_handle handle; + + mtx_lock_spin(&callout_lock); + + /* Fill in the next free callout structure. */ + new = SLIST_FIRST(&callfree); + if (new == NULL) + /* XXX Attempt to malloc first */ + panic("timeout table full"); + SLIST_REMOVE_HEAD(&callfree, c_links.sle); + + callout_reset(new, to_ticks, ftn, arg); + + handle.callout = new; + mtx_unlock_spin(&callout_lock); + return (handle); +} + +void +untimeout(ftn, arg, handle) + timeout_t *ftn; + void *arg; + struct callout_handle handle; +{ + + /* + * Check for a handle that was initialized + * by callout_handle_init, but never used + * for a real timeout. + */ + if (handle.callout == NULL) + return; + + mtx_lock_spin(&callout_lock); + if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) + callout_stop(handle.callout); + mtx_unlock_spin(&callout_lock); +} + +void +callout_handle_init(struct callout_handle *handle) +{ + handle->callout = NULL; +} + +/* + * New interface; clients allocate their own callout structures. + * + * callout_reset() - establish or change a timeout + * callout_stop() - disestablish a timeout + * callout_init() - initialize a callout structure so that it can + * safely be passed to callout_reset() and callout_stop() + * + * <sys/callout.h> defines three convenience macros: + * + * callout_active() - returns truth if callout has not been serviced + * callout_pending() - returns truth if callout is still waiting for timeout + * callout_deactivate() - marks the callout as having been serviced + */ +void +callout_reset(c, to_ticks, ftn, arg) + struct callout *c; + int to_ticks; + void (*ftn)(void *); + void *arg; +{ + + mtx_lock_spin(&callout_lock); + if (c->c_flags & CALLOUT_PENDING) + callout_stop(c); + + /* + * We could unlock callout_lock here and lock it again before the + * TAILQ_INSERT_TAIL, but there's no point since doing this setup + * doesn't take much time. 
+ */ + if (to_ticks <= 0) + to_ticks = 1; + + c->c_arg = arg; + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + c->c_func = ftn; + c->c_time = ticks + to_ticks; + TAILQ_INSERT_TAIL(&callwheel[c->c_time & callwheelmask], + c, c_links.tqe); + mtx_unlock_spin(&callout_lock); +} + +int +callout_stop(c) + struct callout *c; +{ + + mtx_lock_spin(&callout_lock); + /* + * Don't attempt to delete a callout that's not on the queue. + */ + if (!(c->c_flags & CALLOUT_PENDING)) { + c->c_flags &= ~CALLOUT_ACTIVE; + mtx_unlock_spin(&callout_lock); + return (0); + } + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + + if (nextsoftcheck == c) { + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + } + TAILQ_REMOVE(&callwheel[c->c_time & callwheelmask], c, c_links.tqe); + c->c_func = NULL; + + if (c->c_flags & CALLOUT_LOCAL_ALLOC) { + SLIST_INSERT_HEAD(&callfree, c, c_links.sle); + } + mtx_unlock_spin(&callout_lock); + return (1); +} + +void +callout_init(c, mpsafe) + struct callout *c; + int mpsafe; +{ + bzero(c, sizeof *c); + if (mpsafe) + c->c_flags |= CALLOUT_MPSAFE; +} + +#ifdef APM_FIXUP_CALLTODO +/* + * Adjust the kernel calltodo timeout list. This routine is used after + * an APM resume to recalculate the calltodo timer list values with the + * number of hz's we have been sleeping. The next hardclock() will detect + * that there are fired timers and run softclock() to execute them. + * + * Please note, I have not done an exhaustive analysis of what code this + * might break. I am motivated to have my select()'s and alarm()'s that + * have expired during suspend firing upon resume so that the applications + * which set the timer can do the maintanence the timer was for as close + * as possible to the originally intended time. Testing this code for a + * week showed that resuming from a suspend resulted in 22 to 25 timers + * firing, which seemed independant on whether the suspend was 2 hours or + * 2 days. Your milage may vary. - Ken Key <key@cs.utk.edu> + */ +void +adjust_timeout_calltodo(time_change) + struct timeval *time_change; +{ + register struct callout *p; + unsigned long delta_ticks; + + /* + * How many ticks were we asleep? + * (stolen from tvtohz()). + */ + + /* Don't do anything */ + if (time_change->tv_sec < 0) + return; + else if (time_change->tv_sec <= LONG_MAX / 1000000) + delta_ticks = (time_change->tv_sec * 1000000 + + time_change->tv_usec + (tick - 1)) / tick + 1; + else if (time_change->tv_sec <= LONG_MAX / hz) + delta_ticks = time_change->tv_sec * hz + + (time_change->tv_usec + (tick - 1)) / tick + 1; + else + delta_ticks = LONG_MAX; + + if (delta_ticks > INT_MAX) + delta_ticks = INT_MAX; + + /* + * Now rip through the timer calltodo list looking for timers + * to expire. + */ + + /* don't collide with softclock() */ + mtx_lock_spin(&callout_lock); + for (p = calltodo.c_next; p != NULL; p = p->c_next) { + p->c_time -= delta_ticks; + + /* Break if the timer had more time on it than delta_ticks */ + if (p->c_time > 0) + break; + + /* take back the ticks the timer didn't use (p->c_time <= 0) */ + delta_ticks = -p->c_time; + } + mtx_unlock_spin(&callout_lock); + + return; +} +#endif /* APM_FIXUP_CALLTODO */ diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c new file mode 100644 index 0000000..ba5faa5 --- /dev/null +++ b/sys/kern/kern_uuid.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2002 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/endian.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sbuf.h> +#include <sys/socket.h> +#include <sys/sysproto.h> +#include <sys/uuid.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> + +/* + * See also: + * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt + * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm + * + * Note that the generator state is itself an UUID, but the time and clock + * sequence fields are written in the native byte order. + */ + +CTASSERT(sizeof(struct uuid) == 16); + +/* We use an alternative, more convenient representation in the generator. */ +struct uuid_private { + union { + uint64_t ll; /* internal. */ + struct { + uint32_t low; + uint16_t mid; + uint16_t hi; + } x; + } time; + uint16_t seq; /* Big-endian. */ + uint16_t node[UUID_NODE_LEN>>1]; +}; + +CTASSERT(sizeof(struct uuid_private) == 16); + +static struct uuid_private uuid_last; + +static struct mtx uuid_mutex; +MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); + +/* + * Return the first MAC address we encounter or, if none was found, + * construct a sufficiently random multicast address. We don't try + * to return the same MAC address as previously returned. We always + * generate a new multicast address if no MAC address exists in the + * system. + * It would be nice to know if 'ifnet' or any of its sub-structures + * has been changed in any way. If not, we could simply skip the + * scan and safely return the MAC address we returned before. + */ +static void +uuid_node(uint16_t *node) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + int i; + + /* XXX: lock ifnet. */ + TAILQ_FOREACH(ifp, &ifnet, if_link) { + /* Walk the address list */ + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + sdl = (struct sockaddr_dl*)ifa->ifa_addr; + if (sdl != NULL && sdl->sdl_family == AF_LINK && + sdl->sdl_type == IFT_ETHER) { + /* Got a MAC address. */ + bcopy(LLADDR(sdl), node, UUID_NODE_LEN); + /* XXX: unlock ifnet. */ + return; + } + } + } + /* XXX: unlock ifnet. 
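+	 * No MAC address was found, so fall through to the fallback below:
+	 * a node value built from arc4random() with one bit forced on so
+	 * that, as the draft referenced above recommends, it reads as a
+	 * multicast-style address and cannot collide with a real interface
+	 * address.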
*/ + + for (i = 0; i < (UUID_NODE_LEN>>1); i++) + node[i] = (uint16_t)arc4random(); + *((uint8_t*)node) |= 0x80; +} + +/* + * Get the current time as a 60 bit count of 100-nanosecond intervals + * since 00:00:00.00, October 15,1582. We apply a magic offset to convert + * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the + * Gregorian reform to the Christian calendar. + */ +static uint64_t +uuid_time(void) +{ + struct bintime bt; + uint64_t time = 0x01B21DD213814000LL; + + bintime(&bt); + time += (uint64_t)bt.sec * 10000000LL; + time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32; + return (time & ((1LL << 60) - 1LL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct uuidgen_args { + struct uuid *store; + int count; +}; +#endif + +int uuidgen(struct thread *td, struct uuidgen_args *uap) +{ + struct uuid_private uuid; + uint64_t time; + int error; + + /* + * Limit the number of UUIDs that can be created at the same time + * to some arbitrary number. This isn't really necessary, but I + * like to have some sort of upper-bound that's less than 2G :-) + * XXX needs to be tunable. + */ + if (uap->count < 1 || uap->count > 2048) + return (EINVAL); + + /* XXX: pre-validate accessibility to the whole of the UUID store? */ + + mtx_lock(&uuid_mutex); + + uuid_node(uuid.node); + time = uuid_time(); + + if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] || + uuid_last.node[1] != uuid.node[1] || + uuid_last.node[2] != uuid.node[2]) + uuid.seq = (uint16_t)arc4random() & 0x3fff; + else if (uuid_last.time.ll >= time) + uuid.seq = (uuid_last.seq + 1) & 0x3fff; + else + uuid.seq = uuid_last.seq; + + uuid_last = uuid; + uuid_last.time.ll = (time + uap->count - 1) & ((1LL << 60) - 1LL); + + mtx_unlock(&uuid_mutex); + + /* Set sequence and variant and deal with byte order. */ + uuid.seq = htobe16(uuid.seq | 0x8000); + + /* XXX: this should copyout larger chunks at a time. */ + do { + /* Set time and version (=1) and deal with byte order. */ + uuid.time.x.low = (uint32_t)time; + uuid.time.x.mid = (uint16_t)(time >> 32); + uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12); + error = copyout(&uuid, uap->store, sizeof(uuid)); + uap->store++; + uap->count--; + time++; + } while (uap->count > 0 && !error); + + return (error); +} + +int +snprintf_uuid(char *buf, size_t sz, struct uuid *uuid) +{ + struct uuid_private *id; + int cnt; + + id = (struct uuid_private *)uuid; + cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x", + id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq), + be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2])); + return (cnt); +} + +int +printf_uuid(struct uuid *uuid) +{ + char buf[38]; + + snprintf_uuid(buf, sizeof(buf), uuid); + return (printf("%s", buf)); +} + +int +sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) +{ + char buf[38]; + + snprintf_uuid(buf, sizeof(buf), uuid); + return (sbuf_printf(sb, "%s", buf)); +} diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 0000000..9d4136b --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/utsname.h> + + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogethostname(td, uap) + struct thread *td; + struct gethostname_args *uap; +{ + int name[2]; + int error; + size_t len = uap->len; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + mtx_lock(&Giant); + error = userland_sysctl(td, name, 2, uap->hostname, &len, 1, 0, 0, 0); + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osethostname(td, uap) + struct thread *td; + register struct sethostname_args *uap; +{ + int name[2]; + int error; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + mtx_lock(&Giant); + if ((error = suser_cred(td->td_ucred, PRISON_ROOT)) == 0) { + error = userland_sysctl(td, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0); + } + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +ogethostid(td, uap) + struct thread *td; + struct ogethostid_args *uap; +{ + + *(long *)(td->td_retval) = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +osethostid(td, uap) + struct thread *td; + struct osethostid_args *uap; +{ + int error; + + mtx_lock(&Giant); + if ((error = suser(td))) + hostid = uap->hostid; + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +oquota(td, uap) + struct thread *td; + struct oquota_args *uap; +{ + return (ENOSYS); +} +#endif /* COMPAT_43 */ + 
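As an aside on how these compatibility wrappers relate to the sysctl tree: the value they read and write is the ordinary kern.hostname node. Below is a minimal userland sketch (illustrative only, not part of this change) that fetches the same string through the {CTL_KERN, KERN_HOSTNAME} MIB pair that ogethostname() hands to userland_sysctl():

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int mib[2] = { CTL_KERN, KERN_HOSTNAME };
		char buf[MAXHOSTNAMELEN];
		size_t len = sizeof(buf);

		/* Same MIB pair the compat syscall builds in-kernel. */
		if (sysctl(mib, 2, buf, &len, NULL, 0) == -1) {
			perror("sysctl");
			return (1);
		}
		printf("kern.hostname = %s\n", buf);
		return (0);
	}

The in-kernel wrappers differ mainly in that osethostname() additionally requires the suser_cred() check before the node may be written.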
+/* + * This is the FreeBSD-1.1 compatable uname(2) interface. These + * days it is done in libc as a wrapper around a bunch of sysctl's. + * This must maintain the old 1.1 binary ABI. + */ +#if SYS_NMLN != 32 +#error "FreeBSD-1.1 uname syscall has been broken" +#endif +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +uname(td, uap) + struct thread *td; + struct uname_args *uap; +{ + int name[2], error; + size_t len; + char *s, *us; + + name[0] = CTL_KERN; + name[1] = KERN_OSTYPE; + len = sizeof (uap->name->sysname); + mtx_lock(&Giant); + error = userland_sysctl(td, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + error = userland_sysctl(td, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + error = userland_sysctl(td, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + error = userland_sysctl(td, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + error = subyte( us++, *s); + if (error) + goto done2; + } + error = subyte( us++, 0); + if (error) + goto done2; + + name[0] = CTL_HW; + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + error = userland_sysctl(td, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if (error) + goto done2; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getdomainname(td, uap) + struct thread *td; + struct getdomainname_args *uap; +{ + int domainnamelen; + int error; + + mtx_lock(&Giant); + domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + error = copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len); + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char *domainname; + int len; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setdomainname(td, uap) + struct thread *td; + struct setdomainname_args *uap; +{ + int error, domainnamelen; + + mtx_lock(&Giant); + if ((error = suser(td))) + goto done2; + if ((u_int)uap->len > sizeof (domainname) - 1) { + error = EINVAL; + goto done2; + } + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; +done2: + mtx_unlock(&Giant); + return (error); +} + diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c new file mode 100644 index 0000000..c9081c3 --- /dev/null +++ b/sys/kern/ksched.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 1996, 1997 + * HD Associates, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* ksched: Soft real time scheduling based on "rtprio". + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resource.h> + +#include <posix4/posix4.h> + +/* ksched: Real-time extension to support POSIX priority scheduling. + */ + +struct ksched { + struct timespec rr_interval; +}; + +int ksched_attach(struct ksched **p) +{ + struct ksched *ksched= p31b_malloc(sizeof(*ksched)); + + ksched->rr_interval.tv_sec = 0; + ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); + + *p = ksched; + return 0; +} + +int ksched_detach(struct ksched *ks) +{ + p31b_free(ks); + + return 0; +} + +/* + * XXX About priorities + * + * POSIX 1003.1b requires that numerically higher priorities be of + * higher priority. It also permits sched_setparam to be + * implementation defined for SCHED_OTHER. I don't like + * the notion of inverted priorites for normal processes when + * you can use "setpriority" for that. + * + * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. 
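+ * As a concrete example of the mapping below: with RTP_PRIO_MAX at its
+ * usual value of 31, POSIX priority 0 becomes rtprio 31 (the weakest
+ * real-time priority) and POSIX priority 31 becomes rtprio 0 (the
+ * strongest).  The two macros are the same expression, so each is its
+ * own inverse.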
+ */ + +/* Macros to convert between the unix (lower numerically is higher priority) + * and POSIX 1003.1b (higher numerically is higher priority) + */ + +#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) +#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) + +/* These improve readability a bit for me: + */ +#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) +#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) + +static __inline int +getscheduler(register_t *ret, struct ksched *ksched, struct thread *td) +{ + struct rtprio rtp; + int e = 0; + + mtx_lock_spin(&sched_lock); + pri_to_rtp(td->td_ksegrp, &rtp); + mtx_unlock_spin(&sched_lock); + switch (rtp.type) + { + case RTP_PRIO_FIFO: + *ret = SCHED_FIFO; + break; + + case RTP_PRIO_REALTIME: + *ret = SCHED_RR; + break; + + default: + *ret = SCHED_OTHER; + break; + } + + return e; +} + +int ksched_setparam(register_t *ret, struct ksched *ksched, + struct thread *td, const struct sched_param *param) +{ + register_t policy; + int e; + + e = getscheduler(&policy, ksched, td); + + if (e == 0) + { + if (policy == SCHED_OTHER) + e = EINVAL; + else + e = ksched_setscheduler(ret, ksched, td, policy, param); + } + + return e; +} + +int ksched_getparam(register_t *ret, struct ksched *ksched, + struct thread *td, struct sched_param *param) +{ + struct rtprio rtp; + + mtx_lock_spin(&sched_lock); + pri_to_rtp(td->td_ksegrp, &rtp); + mtx_unlock_spin(&sched_lock); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + + return 0; +} + +/* + * XXX The priority and scheduler modifications should + * be moved into published interfaces in kern/kern_sync. + * + * The permissions to modify process p were checked in "p31b_proc()". + * + */ +int ksched_setscheduler(register_t *ret, struct ksched *ksched, + struct thread *td, int policy, const struct sched_param *param) +{ + int e = 0; + struct rtprio rtp; + struct ksegrp *kg = td->td_ksegrp; + + switch(policy) + { + case SCHED_RR: + case SCHED_FIFO: + + if (param->sched_priority >= P1B_PRIO_MIN && + param->sched_priority <= P1B_PRIO_MAX) + { + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + rtp.type = (policy == SCHED_FIFO) + ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; + + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, kg); + td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } + else + e = EPERM; + + + break; + + case SCHED_OTHER: + { + rtp.type = RTP_PRIO_NORMAL; + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + mtx_lock_spin(&sched_lock); + rtp_to_pri(&rtp, kg); + + /* XXX Simply revert to whatever we had for last + * normal scheduler priorities. + * This puts a requirement + * on the scheduling code: You must leave the + * scheduling info alone. + */ + td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } + break; + } + + return e; +} + +int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct thread *td) +{ + return getscheduler(ret, ksched, td); +} + +/* ksched_yield: Yield the CPU. 
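+ * Only the reschedule flag is set here; the actual context switch
+ * happens later, once the flag is noticed (typically on the way back
+ * out of the kernel).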
+ */ +int ksched_yield(register_t *ret, struct ksched *ksched) +{ + mtx_lock_spin(&sched_lock); + curthread->td_kse->ke_flags |= KEF_NEEDRESCHED; + mtx_unlock_spin(&sched_lock); + return 0; +} + +int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = RTP_PRIO_MAX; + break; + + case SCHED_OTHER: + *ret = PRIO_MAX; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = P1B_PRIO_MIN; + break; + + case SCHED_OTHER: + *ret = PRIO_MIN; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, + struct thread *td, struct timespec *timespec) +{ + *timespec = ksched->rr_interval; + + return 0; +} diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c new file mode 100644 index 0000000..5a863bd --- /dev/null +++ b/sys/kern/link_aout.c @@ -0,0 +1,590 @@ +/*- + * Copyright (c) 1997-2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifdef __i386__ + +#define FREEBSD_AOUT 1 + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + + +#include "linker_if.h" + +#ifndef __ELF__ +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/vmparam.h> +#endif + +#include <a.out.h> +#include <link.h> + +typedef struct aout_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was this pre-loader */ + char* address; /* Load address */ + struct _dynamic* dynamic; /* Symbol table etc. 
*/ +} *aout_file_t; + +static int link_aout_link_preload(linker_class_t lc, + const char* modname, linker_file_t*); +static int link_aout_link_preload_finish(linker_file_t); + +static int link_aout_load_file(linker_class_t lc, const char*, linker_file_t*); +static int link_aout_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym, + linker_symval_t* symval); +static int link_aout_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp); +static void link_aout_unload_file(linker_file_t); +static void link_aout_unload_preload(linker_file_t); +static int link_aout_lookup_set(linker_file_t, const char*, + void ***, void ***, int*); + +static kobj_method_t link_aout_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_aout_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_aout_symbol_values), + KOBJMETHOD(linker_search_symbol, link_aout_search_symbol), + KOBJMETHOD(linker_unload, link_aout_unload_file), + KOBJMETHOD(linker_load_file, link_aout_load_file), + KOBJMETHOD(linker_link_preload, link_aout_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_aout_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_aout_lookup_set), + { 0, 0 } +}; + +static struct linker_class link_aout_class = { + "a.out", link_aout_methods, sizeof(struct aout_file) +}; + +static int relocate_file(aout_file_t af); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic __DYNAMIC; + +static void +link_aout_init(void* arg) +{ +#ifndef __ELF__ + struct _dynamic* dp = &__DYNAMIC; +#endif + + linker_add_class(&link_aout_class); + +#ifndef __ELF__ + if (dp) { + aout_file_t af; + + linker_kernel_file = + linker_make_file(kernelname, &link_aout_class); + if (linker_kernel_file == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + af = (aout_file_t) linker_kernel_file; + af->address = 0; + af->dynamic = dp; + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(long)linker_kernel_file->address; + } +#endif +} + +SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0); + +static int +link_aout_link_preload(linker_class_t lc, + const char* filename, linker_file_t* result) +{ + caddr_t modptr, baseptr; + char *type; + struct exec *ehdr; + aout_file_t af; + linker_file_t lf; + + /* Look to see if we have the module preloaded. */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) || + strcmp(type, "a.out module") || + ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) || + ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL)) + return(0); /* we can't handle this */ + + /* Register with kld */ + lf = linker_make_file(filename, &link_aout_class); + if (lf == NULL) { + return(ENOMEM); + } + af = (aout_file_t) lf; + + /* Looks like we can handle this one */ + filename = preload_search_info(modptr, MODINFO_NAME); + af->preloaded = 1; + af->address = baseptr; + + /* Assume _DYNAMIC is the first data item. 
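+	 * That is, it sits at the start of the data segment, a_text bytes
+	 * into the image; the d_version check just below effectively
+	 * verifies that guess and rejects files where it does not hold.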
*/ + af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + linker_file_unload(lf); + return(0); /* we can't handle this */ + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf->address = af->address; + lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss; + *result = lf; + return(0); +} + +static int +link_aout_link_preload_finish(linker_file_t lf) +{ + aout_file_t af; + int error; + + af = (aout_file_t) lf; + error = relocate_file(af); + if (error) { + linker_file_unload(lf); + return(error); + } + return(0); +} + +static int +link_aout_load_file(linker_class_t lc, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread *td = curthread; /* XXX */ + int error = 0; + int resid, flags; + struct exec header; + aout_file_t af; + linker_file_t lf = 0; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the a.out header from the file. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC)) + goto out; + + /* + * We have an a.out file, so make some space to read it in. + */ + lf = linker_make_file(filename, &link_aout_class); + if (lf == NULL) { + error = ENOMEM; + goto out; + } + + af = (aout_file_t) lf; + af->address = malloc(header.a_text + header.a_data + header.a_bss, + M_LINKER, M_WAITOK); + + /* + * Read the text and data sections and zero the bss. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address, + header.a_text + header.a_data, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + bzero(af->address + header.a_text + header.a_data, header.a_bss); + + /* + * Assume _DYNAMIC is the first data item. + */ + af->dynamic = (struct _dynamic*) (af->address + header.a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + error = ENOEXEC; + goto out; + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf->address = af->address; + lf->size = header.a_text + header.a_data + header.a_bss; + + error = linker_load_dependencies(lf); + if (error) + goto out; + error = relocate_file(af); + if (error) + goto out; + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_aout_unload_file(linker_file_t file) +{ + aout_file_t af = (aout_file_t) file; + + if (af->preloaded) { + link_aout_unload_preload(file); + return; + } + + if (af->address) + free(af->address, M_LINKER); +} + +static void +link_aout_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +/* + * XXX i386 dependant. 
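+ * The r_length encoding handled below (0, 1 and 2 for byte, word and
+ * long fields) and the in-place addends are a.out/i386 conventions;
+ * other architectures would need their own read/write helpers.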
+ */ +static long +read_relocation(struct relocation_info* r, char* addr) +{ + int length = r->r_length; + + if (length == 0) + return *(u_char*) addr; + else if (length == 1) + return *(u_short*) addr; + else if (length == 2) + return *(u_int*) addr; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); + return 0; +} + +static void +write_relocation(struct relocation_info* r, char* addr, long value) +{ + int length = r->r_length; + + if (length == 0) + *(u_char*) addr = value; + else if (length == 1) + *(u_short*) addr = value; + else if (length == 2) + *(u_int*) addr = value; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); +} + +#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off)) + +static int +relocate_file(aout_file_t af) +{ + struct relocation_info* rel; + struct relocation_info* erel; + struct relocation_info* r; + struct nzlist* symbolbase; + char* stringbase; + struct nzlist* np; + char* sym; + long relocation; + + rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic)); + erel = AOUT_RELOC(af, struct relocation_info, + LD_REL(af->dynamic) + LD_RELSZ(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + for (r = rel; r < erel; r++) { + char* addr; + + if (r->r_address == 0) + break; + + addr = AOUT_RELOC(af, char, r->r_address); + if (r->r_extern) { + np = &symbolbase[r->r_symbolnum]; + sym = &stringbase[np->nz_strx]; + + if (sym[0] != '_') { + printf("link_aout: bad symbol name %s\n", sym); + relocation = 0; + } else + relocation = (intptr_t) + linker_file_lookup_symbol(&af->lf, sym + 1, + np->nz_type != (N_SETV+N_EXT)); + if (!relocation) { + printf("link_aout: symbol %s not found\n", sym); + return ENOENT; + } + + relocation += read_relocation(r, addr); + + if (r->r_jmptable) { + printf("link_aout: can't cope with jump table relocations\n"); + continue; + } + + if (r->r_pcrel) + relocation -= (intptr_t) af->address; + + if (r->r_copy) { + printf("link_aout: can't cope with copy relocations\n"); + continue; + } + + write_relocation(r, addr, relocation); + } else { + write_relocation(r, addr, + (intptr_t)(read_relocation(r, addr) + af->address)); + } + + } + + return 0; +} + +static long +symbol_hash_value(aout_file_t af, const char* name) +{ + long hashval; + const char* p; + + hashval = '_'; /* fake a starting '_' for C symbols */ + for (p = name; *p; p++) + hashval = (hashval << 1) + *p; + + return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic); +} + +int +link_aout_lookup_symbol(linker_file_t file, const char* name, + c_linker_sym_t* sym) +{ + aout_file_t af = (aout_file_t) file; + long hashval; + struct rrs_hash* hashbase; + struct nzlist* symbolbase; + char* stringbase; + struct rrs_hash* hp; + struct nzlist* np; + char* cp; + + if (LD_BUCKETS(af->dynamic) == 0) + return 0; + + hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + +restart: + hashval = symbol_hash_value(af, name); + hp = &hashbase[hashval]; + if (hp->rh_symbolnum == -1) + return ENOENT; + + while (hp) { + np = (struct nzlist *) &symbolbase[hp->rh_symbolnum]; + cp = stringbase + np->nz_strx; + /* + * Note: we fake the leading '_' for C symbols. 
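+		 * symbol_hash_value() above seeds its hash with '_' for the same
+		 * reason, so the chain we are walking was selected as if the
+		 * caller had passed the decorated name; here we just skip the
+		 * underscore before comparing.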
+ */ + if (cp[0] == '_' && !strcmp(cp + 1, name)) + break; + + if (hp->rh_next == 0) + hp = NULL; + else + hp = &hashbase[hp->rh_next]; + } + + if (hp == NULL) + /* + * Not found. + */ + return ENOENT; + + /* + * Check for an aliased symbol, whatever that is. + */ + if (np->nz_type == N_INDR+N_EXT) { + name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */ + goto restart; + } + + /* + * Check this is an actual definition of the symbol. + */ + if (np->nz_value == 0) + return ENOENT; + + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + if (np->nz_other == AUX_FUNC) + /* weak function */ + return ENOENT; + } + + *sym = (linker_sym_t) np; + + return 0; +} + + +static int +link_aout_symbol_values(linker_file_t file, c_linker_sym_t sym, + linker_symval_t* symval) +{ + aout_file_t af = (aout_file_t) file; + const struct nzlist* np = (const struct nzlist*) sym; + char* stringbase; + long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist); + struct nzlist *symbase; + + /* Is it one of ours? It could be another module... */ + symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + if (np < symbase) + return ENOENT; + if ((np - symbase) > numsym) + return ENOENT; + + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */ + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + symval->value = 0; + symval->size = np->nz_value; + } else { + symval->value = AOUT_RELOC(af, char, np->nz_value); + symval->size = np->nz_size; + } + return 0; +} + +static int +link_aout_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + aout_file_t af = (aout_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long sp_nz_value; + struct nzlist* sp; + struct nzlist* ep; + struct nzlist* best = 0; + + for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)), + ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic)); + sp < ep; sp++) { + if (sp->nz_name == 0) + continue; + sp_nz_value = sp->nz_value + (uintptr_t) (void *) af->address; + if (off >= sp_nz_value) { + if (off - sp_nz_value < diff) { + diff = off - sp_nz_value; + best = sp; + if (diff == 0) + break; + } else if (off - sp_nz_value == diff) { + best = sp; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an a.out + gnu LD system. + */ +struct generic_linker_set { + int ls_length; + void *ls_items[1]; +}; +static int +link_aout_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + void **start, **stop; + int error, count; + struct generic_linker_set *setp; + + error = link_aout_lookup_symbol(lf, name, &sym); + if (error) + return error; + link_aout_symbol_values(lf, sym, &symval); + if (symval.value == 0) + return ESRCH; + setp = (struct generic_linker_set *)symval.value; + count = setp->ls_length; + start = &setp->ls_items[0]; + stop = &setp->ls_items[count]; + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + return 0; +} + +#endif /* __i386__ */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c new file mode 100644 index 0000000..dd59405 --- /dev/null +++ b/sys/kern/link_elf.c @@ -0,0 +1,1239 @@ +/*- + * Copyright (c) 1998-2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + +#include <machine/elf.h> +#ifdef GPROF +#include <machine/profile.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#ifdef SPARSE_MAPPING +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#endif +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#ifdef __AOUT__ +#include <nlist.h> +#endif +#include <link.h> + +#include "linker_if.h" + +typedef struct elf_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was file pre-loaded */ + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + Elf_Dyn* dynamic; /* Symbol table etc. 
*/ + Elf_Hashelt nbuckets; /* DT_HASH info */ + Elf_Hashelt nchains; + const Elf_Hashelt* buckets; + const Elf_Hashelt* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +#ifdef DDB + struct link_map gdb; /* hooks for gdb */ +#endif +} *elf_file_t; + +static int link_elf_link_preload(linker_class_t cls, + const char*, linker_file_t*); +static int link_elf_link_preload_finish(linker_file_t); +static int link_elf_load_file(linker_class_t, const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + c_linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_preload(linker_file_t); +static int link_elf_lookup_set(linker_file_t, const char *, + void ***, void ***, int *); +static int link_elf_each_function_name(linker_file_t, + int (*)(const char *, void *), + void *); + +static kobj_method_t link_elf_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), + KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), + KOBJMETHOD(linker_unload, link_elf_unload_file), + KOBJMETHOD(linker_load_file, link_elf_load_file), + KOBJMETHOD(linker_link_preload, link_elf_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), + KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), + { 0, 0 } +}; + +static struct linker_class link_elf_class = { +#if ELF_TARG_CLASS == ELFCLASS32 + "elf32", +#else + "elf64", +#endif + link_elf_methods, sizeof(struct elf_file) +}; + +static int parse_dynamic(elf_file_t ef); +static int relocate_file(elf_file_t ef); +static int link_elf_preload_parse_symbols(elf_file_t ef); + +#ifdef DDB +static void r_debug_state(struct r_debug *dummy_one, + struct link_map *dummy_two); + +/* + * A list of loaded modules for GDB to use for loading symbols. + */ +struct r_debug r_debug; + +#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL); + +/* + * Function for the debugger to set a breakpoint on to gain control. + */ +void +r_debug_state(struct r_debug *dummy_one __unused, + struct link_map *dummy_two __unused) +{ +} + +#endif + +#ifdef __ia64__ +Elf_Addr link_elf_get_gp(linker_file_t); +#endif + +/* + * The kernel symbol table starts here. 
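+ * The linker-provided _DYNAMIC below is handed to parse_dynamic() in
+ * link_elf_init(), so the kernel's own symbols can be resolved through
+ * the same lookup paths as those of loaded modules.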
+ */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#ifdef DDB + char *newfilename; +#endif +#endif + + linker_add_class(&link_elf_class); + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, &link_elf_class); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + + ef = (elf_file_t) linker_kernel_file; + ef->preloaded = 1; + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + + if (dp) + parse_dynamic(ef); + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + ef->gdb.l_addr = linker_kernel_file->address; + newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, modname); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = dp; + ef->gdb.l_prev = 0; + ef->gdb.l_next = 0; + + r_debug.r_map = &ef->gdb; + r_debug.r_brk = r_debug_state; + r_debug.r_state = RT_CONSISTENT; + + r_debug_state(NULL, NULL); /* say hello to gdb! */ +#endif + +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +link_elf_preload_parse_symbols(elf_file_t ef) +{ + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(elf_file_t ef) +{ + Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Hashelt *hashtab = (const Elf_Hashelt *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + 
dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; +#ifdef DDB + case DT_DEBUG: + dp->d_un.d_ptr = (Elf_Addr) &r_debug; + break; +#endif + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +#ifdef DDB + +static void +link_elf_add_gdb(struct link_map *l) +{ + struct link_map *prev; + + /* + * Scan to the end of the list. + */ + for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) + ; + + /* Link in the new entry. */ + l->l_prev = prev; + l->l_next = prev->l_next; + prev->l_next = l; +} + +static void +link_elf_delete_gdb(struct link_map *l) +{ + if (l->l_prev == NULL) { + if ((r_debug.r_map = l->l_next) != NULL) + l->l_next->l_prev = NULL; + return; + } + + if ((l->l_prev->l_next = l->l_next) != NULL) + l->l_next->l_prev = l->l_prev; +} + +#endif /* DDB */ + +static int +link_elf_link_preload(linker_class_t cls, + const char* filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the file preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + lf = linker_make_file(filename, &link_elf_class); + if (lf == NULL) { + return ENOMEM; + } + + ef = (elf_file_t) lf; + ef->preloaded = 1; + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(ef); + if (error) { + linker_file_unload(lf); + return error; + } + *result = lf; + return (0); +} + +static int +link_elf_link_preload_finish(linker_file_t lf) +{ + elf_file_t ef; + int error; +#ifdef DDB + char *newfilename; +#endif + + ef = (elf_file_t) lf; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = 
ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + return error; + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, lf->filename); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + + return (0); +} + +static int +link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread* td = curthread; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid, flags; + elf_file_t ef; + linker_file_t lf; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; +#ifdef DDB + char *newfilename; +#endif + + GIANT_REQUIRED; + + shdr = NULL; + lf = NULL; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the elf header from the file. + */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. 
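+     * (one text and one data segment; a third PT_LOAD makes the scan
+     * below fail with "Too many sections", and an object without a
+     * PT_DYNAMIC header is rejected as well)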
+ */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + + case PT_INTERP: + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + lf = linker_make_file(filename, &link_elf_class); + if (!lf) { + error = ENOMEM; + goto out; + } + + ef = (elf_file_t) lf; +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + ef->object = 0; + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); + if (!ef->address) { + error = ENOMEM; + goto out; + } +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. + */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) { + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + +#ifdef GPROF + /* Update profiling information with the new text segment. */ + kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + + segs[0]->p_memsz)); +#endif + + ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(ef); + if (error) + goto out; + error = linker_load_dependencies(lf); + if (error) + goto out; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) 
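+     * If the section headers or the SHT_SYMTAB section are missing we
+     * jump to the nosyms label and keep the dynamic symbol table that
+     * parse_dynamic() already recorded for ddb.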
*/ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, filename); + ef->gdb.l_name = (const char *)newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = (elf_file_t) file; + +#ifdef DDB + if (ef->gdb.l_ld) { + GDB_STATE(RT_DELETE); + free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); + link_elf_delete_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); + } +#endif + + if (ef->preloaded) { + link_elf_unload_preload(file); + return; + } +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); +} + +static void +link_elf_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(elf_file_t ef) +{ + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + 
} + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. + */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) +{ + elf_file_t ef = (elf_file_t) lf; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = (elf_file_t) lf; + const Elf_Sym* es = (const Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + 
es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = (elf_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long st_value; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + st_value = es->st_value + (uintptr_t) (void *) ef->address; + if (off >= st_value) { + if (off - st_value < diff) { + diff = off - st_value; + best = es; + if (diff == 0) + break; + } else if (off - st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (c_linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an ELF system. + */ +static int +link_elf_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + char *setsym; + void **start, **stop; + int len, error = 0, count; + + len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ + setsym = malloc(len, M_LINKER, M_WAITOK); + if (setsym == NULL) + return ENOMEM; + + /* get address of first entry */ + snprintf(setsym, len, "%s%s", "__start_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + start = (void **)symval.value; + + /* get address of last entry */ + snprintf(setsym, len, "%s%s", "__stop_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + stop = (void **)symval.value; + + /* and the number of entries */ + count = stop - start; + + /* and copy out */ + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + +out: + free(setsym, M_LINKER); + return error; +} + +static int +link_elf_each_function_name(linker_file_t file, + int (*callback)(const char *, void *), void *opaque) { + elf_file_t ef = (elf_file_t)file; + const Elf_Sym* symp; + int i, error; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + if (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC) { + error = callback(ef->ddbstrtab + symp->st_name, opaque); + if (error) + return (error); + } + } + return (0); +} + +#ifdef __ia64__ +/* + * Each KLD has its own GP. The GP value for each load module is given by + * DT_PLTGOT on ia64. We need GP to construct function descriptors, but + * don't have direct access to the ELF file structure. The link_elf_get_gp() + * function returns the GP given a pointer to a generic linker file struct. + */ +Elf_Addr +link_elf_get_gp(linker_file_t lf) +{ + elf_file_t ef = (elf_file_t)lf; + return (Elf_Addr)ef->got; +} +#endif + +/* + * Symbol lookup function that can be used when the symbol index is known (ie + * in relocations). It uses the symbol index instead of doing a fully fledged + * hash table based lookup when such is valid. For example for local symbols. + * This is not only more efficient, it's also more correct. It's not always + * the case that the symbol can be found through the hash table. 
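+ * A static (STB_LOCAL) function referenced by a relocation within the same
+ * object is the obvious example: it may not appear in the DT_HASH chains at
+ * all, but the symbol index carried in the relocation still identifies it
+ * directly.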
+ */ +Elf_Addr +elf_lookup(linker_file_t lf, Elf_Word symidx, int deps) +{ + elf_file_t ef = (elf_file_t)lf; + const Elf_Sym *sym; + const char *symbol; + + /* Don't even try to lookup the symbol if the index is bogus. */ + if (symidx >= ef->nchains) + return (0); + + sym = ef->symtab + symidx; + + /* + * Don't do a full lookup when the symbol is local. It may even + * fail because it may not be found through the hash table. + */ + if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { + /* Force lookup failure when we have an insanity. */ + if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) + return (0); + return ((Elf_Addr)ef->address + sym->st_value); + } + + /* + * XXX we can avoid doing a hash table based lookup for global + * symbols as well. This however is not always valid, so we'll + * just do it the hard way for now. Performance tweaks can + * always be added. + */ + + symbol = ef->strtab + sym->st_name; + + /* Force a lookup failure if the symbol name is bogus. */ + if (*symbol == 0) + return (0); + + return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); +} diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c new file mode 100644 index 0000000..dd59405 --- /dev/null +++ b/sys/kern/link_elf_obj.c @@ -0,0 +1,1239 @@ +/*- + * Copyright (c) 1998-2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> + +#include <machine/elf.h> +#ifdef GPROF +#include <machine/profile.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#ifdef SPARSE_MAPPING +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#endif +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#ifdef __AOUT__ +#include <nlist.h> +#endif +#include <link.h> + +#include "linker_if.h" + +typedef struct elf_file { + struct linker_file lf; /* Common fields */ + int preloaded; /* Was file pre-loaded */ + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + Elf_Dyn* dynamic; /* Symbol table etc. */ + Elf_Hashelt nbuckets; /* DT_HASH info */ + Elf_Hashelt nchains; + const Elf_Hashelt* buckets; + const Elf_Hashelt* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +#ifdef DDB + struct link_map gdb; /* hooks for gdb */ +#endif +} *elf_file_t; + +static int link_elf_link_preload(linker_class_t cls, + const char*, linker_file_t*); +static int link_elf_link_preload_finish(linker_file_t); +static int link_elf_load_file(linker_class_t, const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + c_linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + c_linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_preload(linker_file_t); +static int link_elf_lookup_set(linker_file_t, const char *, + void ***, void ***, int *); +static int link_elf_each_function_name(linker_file_t, + int (*)(const char *, void *), + void *); + +static kobj_method_t link_elf_methods[] = { + KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), + KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), + KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), + KOBJMETHOD(linker_unload, link_elf_unload_file), + KOBJMETHOD(linker_load_file, link_elf_load_file), + KOBJMETHOD(linker_link_preload, link_elf_link_preload), + KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), + KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), + KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), + { 0, 0 } +}; + +static struct linker_class link_elf_class = { +#if ELF_TARG_CLASS == ELFCLASS32 + "elf32", +#else + "elf64", +#endif + link_elf_methods, sizeof(struct elf_file) +}; + +static int parse_dynamic(elf_file_t ef); +static 
int relocate_file(elf_file_t ef); +static int link_elf_preload_parse_symbols(elf_file_t ef); + +#ifdef DDB +static void r_debug_state(struct r_debug *dummy_one, + struct link_map *dummy_two); + +/* + * A list of loaded modules for GDB to use for loading symbols. + */ +struct r_debug r_debug; + +#define GDB_STATE(s) r_debug.r_state = s; r_debug_state(NULL, NULL); + +/* + * Function for the debugger to set a breakpoint on to gain control. + */ +void +r_debug_state(struct r_debug *dummy_one __unused, + struct link_map *dummy_two __unused) +{ +} + +#endif + +#ifdef __ia64__ +Elf_Addr link_elf_get_gp(linker_file_t); +#endif + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#ifdef DDB + char *newfilename; +#endif +#endif + + linker_add_class(&link_elf_class); + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, &link_elf_class); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + + ef = (elf_file_t) linker_kernel_file; + ef->preloaded = 1; + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + + if (dp) + parse_dynamic(ef); + linker_kernel_file->address = (caddr_t) KERNBASE; + linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + ef->gdb.l_addr = linker_kernel_file->address; + newfilename = malloc(strlen(modname) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, modname); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = dp; + ef->gdb.l_prev = 0; + ef->gdb.l_next = 0; + + r_debug.r_map = &ef->gdb; + r_debug.r_brk = r_debug_state; + r_debug.r_state = RT_CONSISTENT; + + r_debug_state(NULL, NULL); /* say hello to gdb! 
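+ * The debugger is expected to plant a breakpoint on r_debug_state() (see
+ * r_debug.r_brk above) and to rescan r_debug.r_map each time it fires.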
*/ +#endif + +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +link_elf_preload_parse_symbols(elf_file_t ef) +{ + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(elf_file_t ef) +{ + Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Hashelt *hashtab = (const Elf_Hashelt *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; +#ifdef DDB + case DT_DEBUG: + dp->d_un.d_ptr = (Elf_Addr) &r_debug; + break; +#endif + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +#ifdef DDB + +static void +link_elf_add_gdb(struct link_map *l) +{ + struct link_map *prev; + + /* + * Scan to the end of the list. + */ + for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) + ; + + /* Link in the new entry. 
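+ * That is, append it: the new map's l_prev points at the old tail and the
+ * old tail's l_next now points at the new map, so r_debug.r_map remains a
+ * doubly-linked list that the debugger can walk.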
*/ + l->l_prev = prev; + l->l_next = prev->l_next; + prev->l_next = l; +} + +static void +link_elf_delete_gdb(struct link_map *l) +{ + if (l->l_prev == NULL) { + if ((r_debug.r_map = l->l_next) != NULL) + l->l_next->l_prev = NULL; + return; + } + + if ((l->l_prev->l_next = l->l_next) != NULL) + l->l_next->l_prev = l->l_prev; +} + +#endif /* DDB */ + +static int +link_elf_link_preload(linker_class_t cls, + const char* filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the file preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return ENOENT; + + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + lf = linker_make_file(filename, &link_elf_class); + if (lf == NULL) { + return ENOMEM; + } + + ef = (elf_file_t) lf; + ef->preloaded = 1; + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(ef); + if (error) { + linker_file_unload(lf); + return error; + } + *result = lf; + return (0); +} + +static int +link_elf_link_preload_finish(linker_file_t lf) +{ + elf_file_t ef; + int error; +#ifdef DDB + char *newfilename; +#endif + + ef = (elf_file_t) lf; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + return error; + (void)link_elf_preload_parse_symbols(ef); + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, lf->filename); + ef->gdb.l_name = newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + + return (0); +} + +static int +link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct thread* td = curthread; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid, flags; + elf_file_t ef; + linker_file_t lf; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; +#ifdef DDB + char *newfilename; +#endif + + GIANT_REQUIRED; + + shdr = NULL; + lf = NULL; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); + flags = FREAD; + error = vn_open(&nd, &flags, 0); + if (error) + return error; + NDFREE(&nd, NDF_ONLY_PNBUF); + + /* + * Read the elf header from the file. 
+ */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + + case PT_INTERP: + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + lf = linker_make_file(filename, &link_elf_class); + if (!lf) { + error = ENOMEM; + goto out; + } + + ef = (elf_file_t) lf; +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + ef->object = 0; + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); + if (!ef->address) { + error = ENOMEM; + goto out; + } +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. 
+ */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) { + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + +#ifdef GPROF + /* Update profiling information with the new text segment. */ + kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + + segs[0]->p_memsz)); +#endif + + ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(ef); + if (error) + goto out; + error = linker_load_dependencies(lf); + if (error) + goto out; +#if 0 /* this will be more trouble than it's worth for now */ + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag != DT_NEEDED) + continue; + modname = ef->strtab + dp->d_un.d_val; + error = linker_load_module(modname, lf); + if (error) + goto out; + } +#endif + error = relocate_file(ef); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) */ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, &resid, td); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +#ifdef DDB + GDB_STATE(RT_ADD); + ef->gdb.l_addr = lf->address; + newfilename = malloc(strlen(filename) + 1, M_LINKER, M_WAITOK); + strcpy(newfilename, filename); + ef->gdb.l_name = (const char *)newfilename; + ef->gdb.l_ld = ef->dynamic; + link_elf_add_gdb(&ef->gdb); + GDB_STATE(RT_CONSISTENT); +#endif + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, td); + vn_close(nd.ni_vp, FREAD, td->td_ucred, td); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = (elf_file_t) file; + +#ifdef DDB + if (ef->gdb.l_ld) { + GDB_STATE(RT_DELETE); + free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); + link_elf_delete_gdb(&ef->gdb); + 
GDB_STATE(RT_CONSISTENT); + } +#endif + + if (ef->preloaded) { + link_elf_unload_preload(file); + return; + } +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); +} + +static void +link_elf_unload_preload(linker_file_t file) +{ + if (file->filename) + preload_delete_name(file->filename); +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(elf_file_t ef) +{ + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->rela + ef->relasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *)((const char *)ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + if (elf_reloc(&ef->lf, rel, ELF_RELOC_REL)) { + symname = symbol_name(ef, rel->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *)((const char *)ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + if (elf_reloc(&ef->lf, rela, ELF_RELOC_RELA)) { + symname = symbol_name(ef, rela->r_info); + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
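+ * The value it produces, taken modulo nbuckets, selects a DT_HASH bucket;
+ * link_elf_lookup_symbol() then follows the chains array from that bucket
+ * until a name matches or the chain ends at STN_UNDEF.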
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) +{ + elf_file_t ef = (elf_file_t) lf; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (c_linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = (elf_file_t) lf; + const Elf_Sym* es = (const Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + c_linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = (elf_file_t) lf; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + u_long st_value; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + st_value = es->st_value + (uintptr_t) (void *) ef->address; + if (off >= st_value) { + if (off - st_value < diff) { + diff = off - st_value; + best = es; + if (diff == 0) + break; + } else if (off - st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (c_linker_sym_t) best; + + return 0; +} + +/* + * Look up a linker set on an ELF system. 
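+ * A set named "foo" is delimited by the linker-provided symbols
+ * __start_set_foo and __stop_set_foo; the entries between them form an
+ * array of pointers, so the count is simply the pointer difference.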
+ */ +static int +link_elf_lookup_set(linker_file_t lf, const char *name, + void ***startp, void ***stopp, int *countp) +{ + c_linker_sym_t sym; + linker_symval_t symval; + char *setsym; + void **start, **stop; + int len, error = 0, count; + + len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ + setsym = malloc(len, M_LINKER, M_WAITOK); + if (setsym == NULL) + return ENOMEM; + + /* get address of first entry */ + snprintf(setsym, len, "%s%s", "__start_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + start = (void **)symval.value; + + /* get address of last entry */ + snprintf(setsym, len, "%s%s", "__stop_set_", name); + error = link_elf_lookup_symbol(lf, setsym, &sym); + if (error) + goto out; + link_elf_symbol_values(lf, sym, &symval); + if (symval.value == 0) { + error = ESRCH; + goto out; + } + stop = (void **)symval.value; + + /* and the number of entries */ + count = stop - start; + + /* and copy out */ + if (startp) + *startp = start; + if (stopp) + *stopp = stop; + if (countp) + *countp = count; + +out: + free(setsym, M_LINKER); + return error; +} + +static int +link_elf_each_function_name(linker_file_t file, + int (*callback)(const char *, void *), void *opaque) { + elf_file_t ef = (elf_file_t)file; + const Elf_Sym* symp; + int i, error; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + if (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC) { + error = callback(ef->ddbstrtab + symp->st_name, opaque); + if (error) + return (error); + } + } + return (0); +} + +#ifdef __ia64__ +/* + * Each KLD has its own GP. The GP value for each load module is given by + * DT_PLTGOT on ia64. We need GP to construct function descriptors, but + * don't have direct access to the ELF file structure. The link_elf_get_gp() + * function returns the GP given a pointer to a generic linker file struct. + */ +Elf_Addr +link_elf_get_gp(linker_file_t lf) +{ + elf_file_t ef = (elf_file_t)lf; + return (Elf_Addr)ef->got; +} +#endif + +/* + * Symbol lookup function that can be used when the symbol index is known (ie + * in relocations). It uses the symbol index instead of doing a fully fledged + * hash table based lookup when such is valid. For example for local symbols. + * This is not only more efficient, it's also more correct. It's not always + * the case that the symbol can be found through the hash table. + */ +Elf_Addr +elf_lookup(linker_file_t lf, Elf_Word symidx, int deps) +{ + elf_file_t ef = (elf_file_t)lf; + const Elf_Sym *sym; + const char *symbol; + + /* Don't even try to lookup the symbol if the index is bogus. */ + if (symidx >= ef->nchains) + return (0); + + sym = ef->symtab + symidx; + + /* + * Don't do a full lookup when the symbol is local. It may even + * fail because it may not be found through the hash table. + */ + if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { + /* Force lookup failure when we have an insanity. */ + if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) + return (0); + return ((Elf_Addr)ef->address + sym->st_value); + } + + /* + * XXX we can avoid doing a hash table based lookup for global + * symbols as well. This however is not always valid, so we'll + * just do it the hard way for now. Performance tweaks can + * always be added. + */ + + symbol = ef->strtab + sym->st_name; + + /* Force a lookup failure if the symbol name is bogus. 
*/ + if (*symbol == 0) + return (0); + + return ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); +} diff --git a/sys/kern/linker_if.m b/sys/kern/linker_if.m new file mode 100644 index 0000000..9dafb57 --- /dev/null +++ b/sys/kern/linker_if.m @@ -0,0 +1,107 @@ +# +# Copyright (c) 2000 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/linker.h> + +INTERFACE linker; + +# +# Lookup a symbol in the file's symbol table. If the symbol is not +# found then return ENOENT, otherwise zero. +# +METHOD int lookup_symbol { + linker_file_t file; + const char* name; + c_linker_sym_t* symp; +}; + +METHOD int symbol_values { + linker_file_t file; + c_linker_sym_t sym; + linker_symval_t* valp; +}; + +METHOD int search_symbol { + linker_file_t file; + caddr_t value; + c_linker_sym_t* symp; + long* diffp; +}; + +# +# Call the callback with each specified function defined in the file. +# Stop and return the error if the callback returns an error. +# +METHOD int each_function_name { + linker_file_t file; + linker_function_name_callback_t callback; + void* opaque; +}; + +# +# Search for a linker set in a file. Return a pointer to the first +# entry (which is itself a pointer), and the number of entries. +# "stop" points to the entry beyond the last valid entry. +# If count, start or stop are NULL, they are not returned. +# +METHOD int lookup_set { + linker_file_t file; + const char* name; + void*** start; + void*** stop; + int* count; +}; + +# +# Unload a file, releasing dependancies and freeing storage. +# +METHOD void unload { + linker_file_t file; +}; + +# +# Load a file, returning the new linker_file_t in *result. If +# the class does not recognise the file type, zero should be +# returned, without modifying *result. If the file is +# recognised, the file should be loaded, *result set to the new +# file and zero returned. If some other error is detected an +# appropriate errno should be returned. 
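+# The link_preload/link_preload_finish pair below handles the case where the
+# boot loader has already placed the module image in memory: link_preload
+# claims the preloaded image and parses its dynamic section, and
+# link_preload_finish then completes relocation in a second pass.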
+# +STATICMETHOD int load_file { + linker_class_t cls; + const char* filename; + linker_file_t* result; +}; +STATICMETHOD int link_preload { + linker_class_t cls; + const char* filename; + linker_file_t* result; +}; +STATICMETHOD int link_preload_finish { + linker_file_t file; +}; diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 0000000..f4a0212 --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,446 @@ +#! /bin/sh - +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $FreeBSD$ + +set -e + +# name of compat option: +compat=COMPAT_43 + +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +sysmk="../sys/syscall.mk" +syssw="init_sysent.c" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" + +# tmp files: +sysdcl="sysent.dcl.$$" +syscompat="sysent.compat.$$" +syscompatdcl="sysent.compatdcl.$$" +sysent="sysent.switch.$$" +sysinc="sysinc.switch.$$" +sysarg="sysarg.switch.$$" + +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 + +touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg + +case $# in + 0) echo "usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac + +if [ -n "$2" -a -f "$2" ]; then + . $2 +fi + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + syssw = \"$syssw\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + sysmk = \"$sysmk\" + compat = \"$compat\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > syssw + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw + printf " * $%s$\n", "FreeBSD" > syssw + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + printf " * $%s$\n", "FreeBSD" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + printf " * $%s$\n", "FreeBSD" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf " * $%s$\n", "FreeBSD" > syshdr + printf "# FreeBSD system call names.\n" > sysmk + printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk + printf "# $%s$\n", "FreeBSD" > sysmk + } + NR == 1 { + gsub("[$]FreeBSD: ", "", $0) + gsub(" [$]", "", $0) + + printf " * created from%s\n */\n\n", $0 > syssw + + printf "\n/* The casts are bogus but will do for now. */\n" > sysent + printf "struct sysent %s[] = {\n",switchname > sysent + + printf " * created from%s\n */\n\n", $0 > sysarg + printf "#ifndef %s\n", sysproto_h > sysarg + printf "#define\t%s\n\n", sysproto_h > sysarg + printf "#include <sys/signal.h>\n\n" > sysarg + printf "#include <sys/acl.h>\n\n" > sysarg + printf "struct proc;\n\n" > sysarg + printf "struct thread;\n\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? 
\\\n" > sysarg + printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg + printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg + printf "#define\tPADL_(t)\t0\n" > sysarg + printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg + printf "#else\n" > sysarg + printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg + printf "#define\tPADR_(t)\t0\n" > sysarg + printf "#endif\n\n" > sysarg + + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames + + printf " * created from%s\n */\n\n", $0 > syshdr + + printf "# created from%s\nMIASM = ", $0 > sysmk + + next + } + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function align_sysent_comment(column) { + printf("\t") > sysent + column = column + 8 - column % 8 + while (column < 56) { + printf("\t") > sysent + column = column + 8 + } + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # toss number and type + argc= 0; + argssize = "0" + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + argssize = "AS(" $6 ")" + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ + + if ($f != "(") + parserr($f, ")") + f++ + + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } + + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + argname[argc]=$f; + f += 2; # skip name, and any comma + } + if (argc != 0) + argssize = "AS(" argalias ")" + } + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + + # The 'M' type prefix + # + { + mpsafe = "SYF_MPSAFE | "; + if ($2 == "MSTD") { + $2 = "STD"; + } else if ($2 == "MNODEF") { + $2 = "NODEF"; + } else if ($2 == "MNOARGS") { + $2 = "NOARGS"; + } else if ($2 == "MNOPROTO") { + $2 = "NOPROTO"; + } else if ($2 == "MNOIMPL") { + $2 = "NOIMPL"; + } else if ($2 == "MNOSTD") { + $2 = "NOSTD"; + } else if ($2 == "MCOMPAT") { + $2 = "COMPAT"; + } else if ($2 == "MCPT_NOA") { + $2 = "CPT_NOA"; + } else if ($2 == "MLIBCOMPAT") { + $2 = "LIBCOMPAT"; + } else if ($2 == "MOBSOL") { + $2 = "OBSOL"; + } else if ($2 == "MUNIMPL") { + $2 = "UNIMPL"; + } else { + mpsafe = ""; + } + } + $2 == "STD" || $2 == "NODEF" || $2 == 
"NOARGS" || $2 == "NOPROTO" \ + || $2 == "NOIMPL" || $2 == "NOSTD" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys") && (funcname != "lkmressys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct %s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\tchar %s_l_[PADL_(%s)]; " \ + "%s %s; char %s_r_[PADR_(%s)];\n", + argname[i], argtype[i], + argtype[i], argname[i], + argname[i], argtype[i]) > sysarg + printf("};\n") > sysarg + } + else if ($2 != "NOARGS" && $2 != "NOPROTO" && \ + $2 != "NODEF") + printf("struct %s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + } + if (($2 != "NOPROTO" && $2 != "NODEF" && \ + (funcname != "nosys" || !nosys)) || \ + (funcname == "lkmnosys" && !lkmnosys) || \ + funcname == "lkmressys") { + printf("%s\t%s(struct thread *, struct %s *)", + rettype, funcname, argalias) > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + printf("\t{ %s%s, (sy_call_t *)", mpsafe, argssize) > sysent + column = 8 + 2 + length(mpsafe) + length(argssize) + 15 + if ($2 == "NOIMPL") { + printf("%s },", "nosys") > sysent + column = column + length("nosys") + 3 + } else if ($2 == "NOSTD") { + printf("%s },", "lkmressys") > sysent + column = column + length("lkmressys") + 3 + } else { + printf("%s },", funcname) > sysent + column = column + length(funcname) + 3 + } + align_sysent_comment(column) + printf("/* %d = %s */\n", syscall, funcalias) > sysent + printf("\t\"%s\",\t\t\t/* %d = %s */\n", + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") { + printf("#define\t%s%s\t%d\n", syscallprefix, + funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + } + syscall++ + next + } + $2 == "COMPAT" || $2 == "CPT_NOA" { + ncompat++ + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct %s {\n", argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \ + "char %s_r_[PADR_(%s)];\n", + argname[i], argtype[i], + argtype[i], argname[i], + argname[i], argtype[i]) > syscompat + printf("};\n") > syscompat + } + else if($2 != "CPT_NOA") + printf("struct %s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + printf("%s\to%s(struct thread *, struct %s *);\n", + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%s%s,%s) },", + mpsafe, argssize, funcname) > sysent + align_sysent_comment(8 + 9 + length(mpsafe) + \ + length(argssize) + 1 + length(funcname) + 4) + printf("/* %d = old %s */\n", syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", + syscall, funcalias) > syshdr + syscall++ + next + } + $2 == "LIBCOMPAT" { + ncompat++ + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%s%s,%s) },", + mpsafe, argssize, funcname) > sysent + align_sysent_comment(8 + 9 + length(mpsafe) + \ + length(argssize) + 1 + length(funcname) + 4) + printf("/* %d = old %s */\n", syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", + syscallprefix, funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },") > sysent + align_sysent_comment(34) + printf("/* %d = obsolete %s */\n", syscall, comment) > 
sysent + printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", + syscall, comment) > syshdr + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", + syscall, syscall, comment) > sysnames + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc + if (ncompat != 0) { + printf "#include \"opt_compat.h\"\n\n" > syssw + printf "\n#ifdef %s\n", compat > sysinc + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc + printf "#else\n" > sysinc + printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc + printf "#endif\n" > sysinc + } + + printf("\n#endif /* %s */\n\n", compat) > syscompatdcl + printf("#undef PAD_\n") > syscompatdcl + printf("#undef PADL_\n") > syscompatdcl + printf("#undef PADR_\n") > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("\n") > sysmk + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >> $syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c new file mode 100644 index 0000000..e3a0bfa --- /dev/null +++ b/sys/kern/md4c.c @@ -0,0 +1,285 @@ +/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm + * $FreeBSD$ + */ + +/* Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved. + + License to copy and use this software is granted provided that it + is identified as the "RSA Data Security, Inc. MD4 Message-Digest + Algorithm" in all material mentioning or referencing this software + or this function. + + License is also granted to make and use derivative works provided + that such works are identified as "derived from the RSA Data + Security, Inc. MD4 Message-Digest Algorithm" in all material + mentioning or referencing the derived work. + + RSA Data Security, Inc. makes no representations concerning either + the merchantability of this software or the suitability of this + software for any particular purpose. It is provided "as is" + without express or implied warranty of any kind. + + These notices must be retained in any copies of any part of this + documentation and/or software. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/md4.h> + +typedef unsigned char *POINTER; +typedef u_int16_t UINT2; +typedef u_int32_t UINT4; + +#define PROTO_LIST(list) list + +/* Constants for MD4Transform routine. + */ +#define S11 3 +#define S12 7 +#define S13 11 +#define S14 19 +#define S21 3 +#define S22 5 +#define S23 9 +#define S24 13 +#define S31 3 +#define S32 9 +#define S33 11 +#define S34 15 + +static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); +static void Encode PROTO_LIST + ((unsigned char *, UINT4 *, unsigned int)); +static void Decode PROTO_LIST + ((UINT4 *, const unsigned char *, unsigned int)); + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G and H are basic MD4 functions. 
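+ * F is a bitwise conditional (y where x is 1, z where x is 0), G is the
+ * majority function, and H is parity (three-way XOR).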
+ */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG and HH are transformations for rounds 1, 2 and 3 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s) { \ + (a) += F ((b), (c), (d)) + (x); \ + (a) = ROTATE_LEFT ((a), (s)); \ + } +#define GG(a, b, c, d, x, s) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \ + (a) = ROTATE_LEFT ((a), (s)); \ + } +#define HH(a, b, c, d, x, s) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \ + (a) = ROTATE_LEFT ((a), (s)); \ + } + +/* MD4 initialization. Begins an MD4 operation, writing a new context. + */ +void MD4Init (context) +MD4_CTX *context; /* context */ +{ + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. + */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* MD4 block update operation. Continues an MD4 message-digest + operation, processing another message block, and updating the + context. + */ +void MD4Update (context, input, inputLen) +MD4_CTX *context; /* context */ +const unsigned char *input; /* input block */ +unsigned int inputLen; /* length of input block */ +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + /* Update number of bits */ + if ((context->count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context->count[1]++; + context->count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + /* Transform as many times as possible. + */ + if (inputLen >= partLen) { + bcopy(input, &context->buffer[index], partLen); + MD4Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD4Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + bcopy(&input[i], &context->buffer[index], inputLen-i); +} + +/* MD4 padding. */ +void MD4Pad (context) +MD4_CTX *context; /* context */ +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. + */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD4Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD4Update (context, bits, 8); +} + +/* MD4 finalization. Ends an MD4 message-digest operation, writing the + the message digest and zeroizing the context. + */ +void MD4Final (digest, context) +unsigned char digest[16]; /* message digest */ +MD4_CTX *context; /* context */ +{ + /* Do padding */ + MD4Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. + */ + bzero((POINTER)context, sizeof (*context)); +} + +/* MD4 basic transformation. Transforms state based on block. 
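A minimal caller of the MD4Init/MD4Update/MD4Final interface implemented above might look like the following sketch; it assumes the userland libmd-style <md4.h> declarations rather than the kernel <sys/md4.h>, and linking with -lmd:

#include <stdio.h>
#include <string.h>
#include <md4.h>	/* assumed libmd header declaring MD4_CTX and friends */

int
main(void)
{
	MD4_CTX ctx;
	unsigned char digest[16];
	const char *msg = "abc";
	int i;

	MD4Init(&ctx);				/* load the magic initial state */
	MD4Update(&ctx, (const unsigned char *)msg, strlen(msg));
	MD4Final(digest, &ctx);			/* pads, appends length, zeroizes ctx */

	for (i = 0; i < 16; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}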
+ */ +static void MD4Transform (state, block) +UINT4 state[4]; +const unsigned char block[64]; +{ + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11); /* 1 */ + FF (d, a, b, c, x[ 1], S12); /* 2 */ + FF (c, d, a, b, x[ 2], S13); /* 3 */ + FF (b, c, d, a, x[ 3], S14); /* 4 */ + FF (a, b, c, d, x[ 4], S11); /* 5 */ + FF (d, a, b, c, x[ 5], S12); /* 6 */ + FF (c, d, a, b, x[ 6], S13); /* 7 */ + FF (b, c, d, a, x[ 7], S14); /* 8 */ + FF (a, b, c, d, x[ 8], S11); /* 9 */ + FF (d, a, b, c, x[ 9], S12); /* 10 */ + FF (c, d, a, b, x[10], S13); /* 11 */ + FF (b, c, d, a, x[11], S14); /* 12 */ + FF (a, b, c, d, x[12], S11); /* 13 */ + FF (d, a, b, c, x[13], S12); /* 14 */ + FF (c, d, a, b, x[14], S13); /* 15 */ + FF (b, c, d, a, x[15], S14); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 0], S21); /* 17 */ + GG (d, a, b, c, x[ 4], S22); /* 18 */ + GG (c, d, a, b, x[ 8], S23); /* 19 */ + GG (b, c, d, a, x[12], S24); /* 20 */ + GG (a, b, c, d, x[ 1], S21); /* 21 */ + GG (d, a, b, c, x[ 5], S22); /* 22 */ + GG (c, d, a, b, x[ 9], S23); /* 23 */ + GG (b, c, d, a, x[13], S24); /* 24 */ + GG (a, b, c, d, x[ 2], S21); /* 25 */ + GG (d, a, b, c, x[ 6], S22); /* 26 */ + GG (c, d, a, b, x[10], S23); /* 27 */ + GG (b, c, d, a, x[14], S24); /* 28 */ + GG (a, b, c, d, x[ 3], S21); /* 29 */ + GG (d, a, b, c, x[ 7], S22); /* 30 */ + GG (c, d, a, b, x[11], S23); /* 31 */ + GG (b, c, d, a, x[15], S24); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 0], S31); /* 33 */ + HH (d, a, b, c, x[ 8], S32); /* 34 */ + HH (c, d, a, b, x[ 4], S33); /* 35 */ + HH (b, c, d, a, x[12], S34); /* 36 */ + HH (a, b, c, d, x[ 2], S31); /* 37 */ + HH (d, a, b, c, x[10], S32); /* 38 */ + HH (c, d, a, b, x[ 6], S33); /* 39 */ + HH (b, c, d, a, x[14], S34); /* 40 */ + HH (a, b, c, d, x[ 1], S31); /* 41 */ + HH (d, a, b, c, x[ 9], S32); /* 42 */ + HH (c, d, a, b, x[ 5], S33); /* 43 */ + HH (b, c, d, a, x[13], S34); /* 44 */ + HH (a, b, c, d, x[ 3], S31); /* 45 */ + HH (d, a, b, c, x[11], S32); /* 46 */ + HH (c, d, a, b, x[ 7], S33); /* 47 */ + HH (b, c, d, a, x[15], S34); /* 48 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. + */ + bzero((POINTER)x, sizeof (x)); +} + +/* Encodes input (UINT4) into output (unsigned char). Assumes len is + a multiple of 4. + */ +static void Encode (output, input, len) +unsigned char *output; +UINT4 *input; +unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* Decodes input (unsigned char) into output (UINT4). Assumes len is + a multiple of 4. + */ +static void Decode (output, input, len) + +UINT4 *output; +const unsigned char *input; +unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..72c970b --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,339 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. 
+ * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +/* + * This file should be kept in sync with src/lib/libmd/md5c.c + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <machine/endian.h> +#include <sys/endian.h> +#include <sys/md5.h> + +static void MD5Transform(u_int32_t [4], const unsigned char [64]); + +#ifdef _KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define Encode memcpy +#define Decode memcpy +#else + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +static void +Encode (unsigned char *output, u_int32_t *input, unsigned int len) +{ + unsigned int i; + u_int32_t *op = (u_int32_t *)output; + + for (i = 0; i < len / 4; i++) + op[i] = htole32(input[i]); +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (u_int32_t *output, const unsigned char *input, unsigned int len) +{ + unsigned int i; + const u_int32_t *ip = (const u_int32_t *)input; + + for (i = 0; i < len / 4; i++) + output[i] = le32toh(ip[i]); +} +#endif + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. 
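Whichever branch of the #if above is compiled, the effect is the same: MD5, like MD4 before it, consumes its input as little-endian 32-bit words, so little-endian hosts can alias Encode/Decode to memcpy while big-endian hosts byte-swap through htole32/le32toh. A small self-contained illustration of the word layout:

#include <stdio.h>
#include <stdint.h>

/* Portable equivalent of Decode() for a single word: the byte stream is
 * interpreted least-significant byte first on every host. */
static uint32_t
decode_one(const unsigned char b[4])
{
	return ((uint32_t)b[0]) | ((uint32_t)b[1] << 8) |
	    ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int
main(void)
{
	const unsigned char b[4] = { 0x01, 0x02, 0x03, 0x04 };

	printf("0x%08x\n", decode_one(b));	/* prints 0x04030201 */
	return (0);
}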
Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. + */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +void +MD5Pad (context) + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + /* Do padding. */ + MD5Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
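The padLen arithmetic in MD5Pad (and MD4Pad earlier) brings the buffered length up to 56 mod 64, so that the 8-byte bit count appended afterwards completes a 64-byte block, and it always inserts at least one byte of padding. A quick standalone check of both branches:

#include <stdio.h>

/* Verify that index + padLen is always congruent to 56 (mod 64) and that
 * padLen stays within [1, 64], for every possible buffer offset. */
int
main(void)
{
	unsigned int index, padLen;

	for (index = 0; index < 64; index++) {
		padLen = (index < 56) ? (56 - index) : (120 - index);
		if ((index + padLen) % 64 != 56 || padLen < 1 || padLen > 64)
			printf("broken at index %u\n", index);
	}
	printf("padLen stays in [1,64] and always lands on 56 mod 64\n");
	return (0);
}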
*/ + +static void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, 
x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. */ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c new file mode 100644 index 0000000..9e6fdca --- /dev/null +++ b/sys/kern/p1003_1b.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 1996, 1997, 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* p1003_1b: Real Time common code. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> + +#include <posix4/posix4.h> + +MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); + +/* The system calls return ENOSYS if an entry is called that is + * not run-time supported. I am also logging since some programs + * start to use this when they shouldn't. That will be removed if annoying. + */ +int +syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap) +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + td->td_proc->p_comm, td->td_proc->p_pid, s); + + /* a " return nosys(p, uap); " here causes a core dump. 
+ */ + + return ENOSYS; +} + +#if !defined(_KPOSIX_PRIORITY_SCHEDULING) + +/* Not configured but loadable via a module: + */ + +static int sched_attach(void) +{ + return 0; +} + +SYSCALL_NOT_PRESENT_GEN(sched_setparam) +SYSCALL_NOT_PRESENT_GEN(sched_getparam) +SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_yield) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) +SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) + +#else + +/* Configured in kernel version: + */ +static struct ksched *ksched; + +static int sched_attach(void) +{ + int ret = ksched_attach(&ksched); + + if (ret == 0) + p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); + + return ret; +} + +/* + * MPSAFE + */ +int sched_setparam(struct thread *td, + struct sched_setparam_args *uap) +{ + struct thread *targettd; + struct proc *targetp; + int e; + struct sched_param sched_param; + + e = copyin(uap->param, &sched_param, sizeof(sched_param)); + if (e) + return (e); + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansched(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_setparam(&td->td_retval[0], ksched, targettd, + (const struct sched_param *)&sched_param); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_getparam(struct thread *td, + struct sched_getparam_args *uap) +{ + int e; + struct sched_param sched_param; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + PROC_UNLOCK(targetp); + if (e) + goto done2; + + e = ksched_getparam(&td->td_retval[0], ksched, targettd, &sched_param); + if (e == 0) + e = copyout(&sched_param, uap->param, sizeof(sched_param)); +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_setscheduler(struct thread *td, + struct sched_setscheduler_args *uap) +{ + int e; + struct sched_param sched_param; + struct thread *targettd; + struct proc *targetp; + + e = copyin(uap->param, &sched_param, sizeof(sched_param)); + if (e) + return (e); + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansched(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_setscheduler(&td->td_retval[0], ksched, targettd, + uap->policy, (const struct sched_param *)&sched_param); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_getscheduler(struct thread *td, + struct sched_getscheduler_args *uap) +{ + int e; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targetp = td->td_proc; + targettd = td; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + 
PROC_UNLOCK(targetp); + if (e == 0) + e = ksched_getscheduler(&td->td_retval[0], ksched, targettd); + +done2: + mtx_unlock(&Giant); + return (e); +} + +/* + * MPSAFE + */ +int sched_yield(struct thread *td, + struct sched_yield_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_yield(&td->td_retval[0], ksched); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_get_priority_max(struct thread *td, + struct sched_get_priority_max_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_get_priority_max(&td->td_retval[0], ksched, uap->policy); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_get_priority_min(struct thread *td, + struct sched_get_priority_min_args *uap) +{ + int error; + + mtx_lock(&Giant); + error = ksched_get_priority_min(&td->td_retval[0], ksched, uap->policy); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int sched_rr_get_interval(struct thread *td, + struct sched_rr_get_interval_args *uap) +{ + int e; + struct thread *targettd; + struct proc *targetp; + + mtx_lock(&Giant); + if (uap->pid == 0) { + targettd = td; + targetp = td->td_proc; + PROC_LOCK(targetp); + } else { + targetp = pfind(uap->pid); + if (targetp == NULL) { + e = ESRCH; + goto done2; + } + targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ + } + + e = p_cansee(td, targetp); + PROC_UNLOCK(targetp); + if (e == 0) { + e = ksched_rr_get_interval(&td->td_retval[0], ksched, targettd, + uap->interval); + } +done2: + mtx_unlock(&Giant); + return (e); +} + +#endif + +static void p31binit(void *notused) +{ + (void) sched_attach(); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c new file mode 100644 index 0000000..09af27d --- /dev/null +++ b/sys/kern/posix4_mib.c @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
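These handlers sit behind the standard POSIX <sched.h> interface. A rough userland sketch of how they are reached; SCHED_FIFO support depends on _KPOSIX_PRIORITY_SCHEDULING being configured or the module being loaded, otherwise the calls fail with ENOSYS as logged by syscall_not_present above:

#include <sched.h>
#include <stdio.h>

int
main(void)
{
	struct sched_param sp;
	int maxpri;

	maxpri = sched_get_priority_max(SCHED_FIFO);
	if (maxpri == -1) {
		perror("sched_get_priority_max");	/* ENOSYS if not configured */
		return (1);
	}
	sp.sched_priority = maxpri;
	/* pid 0 means "the calling process", matching the uap->pid == 0 case above. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");
	return (0);
}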
IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <posix4/posix4.h> + +static int facility[CTL_P1003_1B_MAXID - 1]; + +/* OID_AUTO isn't working with sysconf(3). I guess I'd have to + * modify it to do a lookup by name from the index. + * For now I've left it a top-level sysctl. + */ + +#if 1 + +SYSCTL_DECL(_p1003_1b); + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_p1003_1b, num, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); + +#else + +SYSCTL_DECL(_kern_p1003_1b); + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); +SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B"); + +#endif + +P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io); +P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range); +P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection); +P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing); +P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io); +P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling); +P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals); +P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores); +P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync); +P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects); +P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io); +P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers); +P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max); +P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max); +P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max); +P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize); +P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max); +P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max); +P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max); + +/* p31b_setcfg: Set the configuration + */ +void p31b_setcfg(int num, int value) +{ + if (num >= 1 && num < CTL_P1003_1B_MAXID) + facility[num - 1] = value; +} + +/* + * Turn on indications for standard (non-configurable) kernel features. + */ +static void +p31b_set_standard(void *dummy) +{ + /* ??? p31b_setcfg(CTL_P1003_1B_FSYNC, 1); */ + p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 1); + p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 1); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard, + 0); + diff --git a/sys/kern/subr_acl_posix1e.c b/sys/kern/subr_acl_posix1e.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/subr_acl_posix1e.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. 
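Userland can read the facility values through the sysctl interface. A sketch using sysctlbyname; the node name p1003_1b.pagesize is assumed from the top-level _p1003_1b tree declared above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pagesize;
	size_t len = sizeof(pagesize);

	/* "p1003_1b.pagesize" is assumed from the P1B_SYSCTL table above. */
	if (sysctlbyname("p1003_1b.pagesize", &pagesize, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("pagesize as reported by the kernel: %d\n", pagesize);
	return (0);
}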
+ * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. 
+ */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. 
+ */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. + */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. 
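To make the ACL_MASK clipping concrete, consider a hypothetical access ACL on a file owned by uid 100, gid 200:

	user::rw-
	user:300:rw-
	group::r--
	mask::r--
	other::---

A VREAD request from uid 300 succeeds through the ACL_USER entry. A VWRITE request from uid 300 fails without privilege: the entry itself grants VWRITE, but dac_granted is ANDed with acl_mask_granted (VREAD only) before the acc_mode comparison, so the request falls through to the error label and returns EACCES. The owner (uid 100) is evaluated against ACL_USER_OBJ, which is never clipped by the mask and additionally carries VADMIN.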
+ */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. + */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. 
+ * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. + */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. 
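As a worked example of the mode/ACL conversion helpers above: acl_posix1e_mode_to_perm(ACL_USER_OBJ, 0640) yields ACL_READ | ACL_WRITE, the ACL_GROUP_OBJ tag on the same mode yields ACL_READ, and ACL_OTHER yields no permission bits; feeding those three entries back through acl_posix1e_perms_to_mode() reconstructs 0640. Those three entries are also exactly the minimum that acl_posix1e_check() accepts: one ACL_USER_OBJ, one ACL_GROUP_OBJ and one ACL_OTHER, with an ACL_MASK entry becoming mandatory as soon as any ACL_USER or ACL_GROUP entry is added.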
+ */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. + */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
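User programs do not normally issue these __acl_* system calls directly; they go through the POSIX.1e draft routines in libc, roughly as in the sketch below, where acl_get_file(3) is expected to wrap __acl_get_file underneath:

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
	acl_t acl;
	char *text;

	if (argc != 2) {
		fprintf(stderr, "usage: %s path\n", argv[0]);
		return (1);
	}
	/* Fetch the access ACL for the given path and print its text form. */
	acl = acl_get_file(argv[1], ACL_TYPE_ACCESS);
	if (acl == NULL) {
		perror("acl_get_file");
		return (1);
	}
	text = acl_to_text(acl, NULL);
	if (text != NULL) {
		printf("%s", text);
		acl_free(text);
	}
	acl_free(acl);
	return (0);
}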
+ * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c new file mode 100644 index 0000000..5132e02 --- /dev/null +++ b/sys/kern/subr_autoconf.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratories. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> + +/* + * Autoconfiguration subroutines. + */ + +/* + * "Interrupt driven config" functions. + */ +static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list = + TAILQ_HEAD_INITIALIZER(intr_config_hook_list); + + +/* ARGSUSED */ +static void run_interrupt_driven_config_hooks(void *dummy); +static void +run_interrupt_driven_config_hooks(dummy) + void *dummy; +{ + struct intr_config_hook *hook_entry, *next_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = next_entry) { + next_entry = TAILQ_NEXT(hook_entry, ich_links); + (*hook_entry->ich_func)(hook_entry->ich_arg); + } + + while (!TAILQ_EMPTY(&intr_config_hook_list)) { + tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0); + } +} +SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, + run_interrupt_driven_config_hooks, NULL) + +/* + * Register a hook that will be called after "cold" + * autoconfiguration is complete and interrupts can + * be used to complete initialization. + */ +int +config_intrhook_establish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = TAILQ_NEXT(hook_entry, ich_links)) + if (hook_entry == hook) + break; + if (hook_entry != NULL) { + printf("config_intrhook_establish: establishing an " + "already established hook.\n"); + return (1); + } + TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links); + if (cold == 0) + /* XXX Sufficient for modules loaded after initial config??? */ + run_interrupt_driven_config_hooks(NULL); + return (0); +} + +void +config_intrhook_disestablish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = TAILQ_FIRST(&intr_config_hook_list); + hook_entry != NULL; + hook_entry = TAILQ_NEXT(hook_entry, ich_links)) + if (hook_entry == hook) + break; + if (hook_entry == NULL) + panic("config_intrhook_disestablish: disestablishing an " + "unestablished hook"); + + TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links); + /* Wakeup anyone watching the list */ + wakeup(&intr_config_hook_list); +} diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c new file mode 100644 index 0000000..eeeb7d9 --- /dev/null +++ b/sys/kern/subr_blist.c @@ -0,0 +1,929 @@ + +/* + * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting + * + * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution + * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT. 
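A driver would typically use the establish/disestablish pair above along the following lines; the mydev_* names are hypothetical and only sketch the intended calling pattern of deferring the tail of attach until interrupts work:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>

static void mydev_finish_attach(void *arg);
static struct intr_config_hook mydev_hook;

static int
mydev_attach(void *sc)
{
	mydev_hook.ich_func = mydev_finish_attach;
	mydev_hook.ich_arg = sc;
	/* Defer the interrupt-driven part of attach until after cold boot. */
	if (config_intrhook_establish(&mydev_hook) != 0)
		return (ENXIO);
	return (0);
}

static void
mydev_finish_attach(void *arg)
{
	/* ... interrupt-driven probing of the hardware using arg ... */
	config_intrhook_disestablish(&mydev_hook);	/* lets boot proceed */
}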
+ * + * This module implements a general bitmap allocator/deallocator. The + * allocator eats around 2 bits per 'block'. The module does not + * try to interpret the meaning of a 'block' other then to return + * SWAPBLK_NONE on an allocation failure. + * + * A radix tree is used to maintain the bitmap. Two radix constants are + * involved: One for the bitmaps contained in the leaf nodes (typically + * 32), and one for the meta nodes (typically 16). Both meta and leaf + * nodes have a hint field. This field gives us a hint as to the largest + * free contiguous range of blocks under the node. It may contain a + * value that is too high, but will never contain a value that is too + * low. When the radix tree is searched, allocation failures in subtrees + * update the hint. + * + * The radix tree also implements two collapsed states for meta nodes: + * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is + * in either of these two states, all information contained underneath + * the node is considered stale. These states are used to optimize + * allocation and freeing operations. + * + * The hinting greatly increases code efficiency for allocations while + * the general radix structure optimizes both allocations and frees. The + * radix tree should be able to operate well no matter how much + * fragmentation there is and no matter how large a bitmap is used. + * + * Unlike the rlist code, the blist code wires all necessary memory at + * creation time. Neither allocations nor frees require interaction with + * the memory subsystem. In contrast, the rlist code may allocate memory + * on an rlist_free() call. The non-blocking features of the blist code + * are used to great advantage in the swap code (vm/nswap_pager.c). The + * rlist code uses a little less overall memory then the blist code (but + * due to swap interleaving not all that much less), but the blist code + * scales much, much better. + * + * LAYOUT: The radix tree is layed out recursively using a + * linear array. Each meta node is immediately followed (layed out + * sequentially in memory) by BLIST_META_RADIX lower level nodes. This + * is a recursive structure but one that can be easily scanned through + * a very simple 'skip' calculation. In order to support large radixes, + * portions of the tree may reside outside our memory allocation. We + * handle this with an early-termination optimization (when bighint is + * set to -1) on the scan. The memory allocation is only large enough + * to cover the number of blocks requested at creation time even if it + * must be encompassed in larger root-node radix. + * + * NOTE: the allocator cannot currently allocate more then + * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too + * large' if you try. This is an area that could use improvement. The + * radix is large enough that this restriction does not effect the swap + * system, though. Currently only the allocation code is effected by + * this algorithmic unfeature. The freeing code can handle arbitrary + * ranges. + * + * This code can be compiled stand-alone for debugging. 
+ * + * $FreeBSD$ + */ + +#ifdef _KERNEL + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/blist.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/mutex.h> +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#else + +#ifndef BLIST_NO_DEBUG +#define BLIST_DEBUG +#endif + +#define SWAPBLK_NONE ((daddr_t)-1) + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> + +#define malloc(a,b,c) malloc(a) +#define free(a,b) free(a) + +typedef unsigned int u_daddr_t; + +#include <sys/blist.h> + +void panic(const char *ctl, ...); + +#endif + +/* + * static support functions + */ + +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, + daddr_t count, daddr_t radix, int skip); +static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); +static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, + daddr_t radix, int skip, daddr_t blk); +static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, blist_t dest, daddr_t count); +static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, + int skip, daddr_t count); +#ifndef _KERNEL +static void blst_radix_print(blmeta_t *scan, daddr_t blk, + daddr_t radix, int skip, int tab); +#endif + +#ifdef _KERNEL +static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); +#endif + +/* + * blist_create() - create a blist capable of handling up to the specified + * number of blocks + * + * blocks must be greater then 0 + * + * The smallest blist consists of a single leaf node capable of + * managing BLIST_BMAP_RADIX blocks. + */ + +blist_t +blist_create(daddr_t blocks) +{ + blist_t bl; + int radix; + int skip = 0; + + /* + * Calculate radix and skip field used for scanning. + */ + radix = BLIST_BMAP_RADIX; + + while (radix < blocks) { + radix <<= BLIST_META_RADIX_SHIFT; + skip = (skip + 1) << BLIST_META_RADIX_SHIFT; + } + + bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO); + + bl->bl_blocks = blocks; + bl->bl_radix = radix; + bl->bl_skip = skip; + bl->bl_rootblks = 1 + + blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); + bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK); + +#if defined(BLIST_DEBUG) + printf( + "BLIST representing %d blocks (%d MB of swap)" + ", requiring %dK of ram\n", + bl->bl_blocks, + bl->bl_blocks * 4 / 1024, + (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + ); + printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks); +#endif + blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); + + return(bl); +} + +void +blist_destroy(blist_t bl) +{ + free(bl->bl_root, M_SWAP); + free(bl, M_SWAP); +} + +/* + * blist_alloc() - reserve space in the block bitmap. Return the base + * of a contiguous region or SWAPBLK_NONE if space could + * not be allocated. + */ + +daddr_t +blist_alloc(blist_t bl, daddr_t count) +{ + daddr_t blk = SWAPBLK_NONE; + + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blk = blst_leaf_alloc(bl->bl_root, 0, count); + else + blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); + if (blk != SWAPBLK_NONE) + bl->bl_free -= count; + } + return(blk); +} + +/* + * blist_free() - free up space in the block bitmap. Return the base + * of a contiguous region. Panic if an inconsistancy is + * found. 
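+ *
+ *			A minimal usage sketch (swapblk and npages are
+ *			hypothetical): callers free exactly the range they
+ *			previously reserved with blist_alloc(), e.g.
+ *
+ *				swapblk = blist_alloc(bl, npages);
+ *				...
+ *				blist_free(bl, swapblk, npages);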
+ */ + +void +blist_free(blist_t bl, daddr_t blkno, daddr_t count) +{ + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blst_leaf_free(bl->bl_root, blkno, count); + else + blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); + bl->bl_free += count; + } +} + +/* + * blist_resize() - resize an existing radix tree to handle the + * specified number of blocks. This will reallocate + * the tree and transfer the previous bitmap to the new + * one. When extending the tree you can specify whether + * the new blocks are to left allocated or freed. + */ + +void +blist_resize(blist_t *pbl, daddr_t count, int freenew) +{ + blist_t newbl = blist_create(count); + blist_t save = *pbl; + + *pbl = newbl; + if (count > save->bl_blocks) + count = save->bl_blocks; + blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); + + /* + * If resizing upwards, should we free the new space or not? + */ + if (freenew && count < newbl->bl_blocks) { + blist_free(newbl, count, newbl->bl_blocks - count); + } + blist_destroy(save); +} + +#ifdef BLIST_DEBUG + +/* + * blist_print() - dump radix tree + */ + +void +blist_print(blist_t bl) +{ + printf("BLIST {\n"); + blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); + printf("}\n"); +} + +#endif + +/************************************************************************ + * ALLOCATION SUPPORT FUNCTIONS * + ************************************************************************ + * + * These support functions do all the actual work. They may seem + * rather longish, but that's because I've commented them up. The + * actual code is straight forward. + * + */ + +/* + * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). + * + * This is the core of the allocator and is optimized for the 1 block + * and the BLIST_BMAP_RADIX block allocation cases. Other cases are + * somewhat slower. The 1 block allocation case is log2 and extremely + * quick. + */ + +static daddr_t +blst_leaf_alloc( + blmeta_t *scan, + daddr_t blk, + int count +) { + u_daddr_t orig = scan->u.bmu_bitmap; + + if (orig == 0) { + /* + * Optimize bitmap all-allocated case. Also, count = 1 + * case assumes at least 1 bit is free in the bitmap, so + * we have to take care of this case here. + */ + scan->bm_bighint = 0; + return(SWAPBLK_NONE); + } + if (count == 1) { + /* + * Optimized code to allocate one bit out of the bitmap + */ + u_daddr_t mask; + int j = BLIST_BMAP_RADIX/2; + int r = 0; + + mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); + + while (j) { + if ((orig & mask) == 0) { + r += j; + orig >>= j; + } + j >>= 1; + mask >>= j; + } + scan->u.bmu_bitmap &= ~(1 << r); + return(blk + r); + } + if (count <= BLIST_BMAP_RADIX) { + /* + * non-optimized code to allocate N bits out of the bitmap. + * The more bits, the faster the code runs. It will run + * the slowest allocating 2 bits, but since there aren't any + * memory ops in the core loop (or shouldn't be, anyway), + * you probably won't notice the difference. + */ + int j; + int n = BLIST_BMAP_RADIX - count; + u_daddr_t mask; + + mask = (u_daddr_t)-1 >> n; + + for (j = 0; j <= n; ++j) { + if ((orig & mask) == mask) { + scan->u.bmu_bitmap &= ~mask; + return(blk + j); + } + mask = (mask << 1); + } + } + /* + * We couldn't allocate count in this subtree, update bighint. + */ + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * blist_meta_alloc() - allocate at a meta in the radix tree. + * + * Attempt to allocate at a meta node. 
If we can't, we update + * bighint and return a failure. Updating bighint optimize future + * calls that hit this node. We have to check for our collapse cases + * and we have a few optimizations strewn in as well. + */ + +static daddr_t +blst_meta_alloc( + blmeta_t *scan, + daddr_t blk, + daddr_t count, + daddr_t radix, + int skip +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + scan->bm_bighint = count; + return(SWAPBLK_NONE); + } + + if (scan->u.bmu_avail == radix) { + radix >>= BLIST_META_RADIX_SHIFT; + + /* + * ALL-FREE special case, initialize uninitialize + * sublevel. + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (u_daddr_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = radix; + scan[i].u.bmu_avail = radix; + } + } + } else { + radix >>= BLIST_META_RADIX_SHIFT; + } + + for (i = 1; i <= skip; i += next_skip) { + if (count <= scan[i].bm_bighint) { + /* + * count fits in object + */ + daddr_t r; + if (next_skip == 1) { + r = blst_leaf_alloc(&scan[i], blk, count); + } else { + r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); + } + if (r != SWAPBLK_NONE) { + scan->u.bmu_avail -= count; + if (scan->bm_bighint > scan->u.bmu_avail) + scan->bm_bighint = scan->u.bmu_avail; + return(r); + } + } else if (scan[i].bm_bighint == (daddr_t)-1) { + /* + * Terminator + */ + break; + } else if (count > radix) { + /* + * count does not fit in object even if it were + * complete free. + */ + panic("blist_meta_alloc: allocation too large"); + } + blk += radix; + } + + /* + * We couldn't allocate count in this subtree, update bighint. + */ + if (scan->bm_bighint >= count) + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * BLST_LEAF_FREE() - free allocated block from leaf bitmap + * + */ + +static void +blst_leaf_free( + blmeta_t *scan, + daddr_t blk, + int count +) { + /* + * free some data in this bitmap + * + * e.g. + * 0000111111111110000 + * \_________/\__/ + * v n + */ + int n = blk & (BLIST_BMAP_RADIX - 1); + u_daddr_t mask; + + mask = ((u_daddr_t)-1 << n) & + ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + if (scan->u.bmu_bitmap & mask) + panic("blst_radix_free: freeing free block"); + scan->u.bmu_bitmap |= mask; + + /* + * We could probably do a better job here. We are required to make + * bighint at least as large as the biggest contiguous block of + * data. If we just shoehorn it, a little extra overhead will + * be incured on the next allocation (but only that one typically). + */ + scan->bm_bighint = BLIST_BMAP_RADIX; +} + +/* + * BLST_META_FREE() - free allocated blocks from radix tree meta info + * + * This support routine frees a range of blocks from the bitmap. + * The range must be entirely enclosed by this radix node. If a + * meta node, we break the range down recursively to free blocks + * in subnodes (which means that this code can free an arbitrary + * range whereas the allocation code cannot allocate an arbitrary + * range). 
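+ *
+ *	For example (illustrative numbers, assuming the usual 32-block
+ *	leaves): freeing blocks 30-95 is broken into a partial free of
+ *	blocks 30-31 in the first leaf and full frees of the leaves
+ *	covering blocks 32-63 and 64-95.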
+ */ + +static void +blst_meta_free( + blmeta_t *scan, + daddr_t freeBlk, + daddr_t count, + daddr_t radix, + int skip, + daddr_t blk +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + +#if 0 + printf("FREE (%x,%d) FROM (%x,%d)\n", + freeBlk, count, + blk, radix + ); +#endif + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case, with possible + * shortcut to ALL-FREE special case. + */ + scan->u.bmu_avail = count; + scan->bm_bighint = count; + + if (count != radix) { + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + scan[i].bm_bighint = 0; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = 0; + } else { + scan[i].u.bmu_avail = 0; + } + } + /* fall through */ + } + } else { + scan->u.bmu_avail += count; + /* scan->bm_bighint = radix; */ + } + + /* + * ALL-FREE special case. + */ + + if (scan->u.bmu_avail == radix) + return; + if (scan->u.bmu_avail > radix) + panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld", + (long long)count, (long long)scan->u.bmu_avail, + (long long)radix); + + /* + * Break the free down into its components + */ + + radix >>= BLIST_META_RADIX_SHIFT; + + i = (freeBlk - blk) / radix; + blk += i * radix; + i = i * next_skip + 1; + + while (i <= skip && blk < freeBlk + count) { + daddr_t v; + + v = blk + radix - freeBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (daddr_t)-1) + panic("blst_meta_free: freeing unexpected range"); + + if (next_skip == 1) { + blst_leaf_free(&scan[i], freeBlk, v); + } else { + blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); + } + if (scan->bm_bighint < scan[i].bm_bighint) + scan->bm_bighint = scan[i].bm_bighint; + count -= v; + freeBlk += v; + blk += radix; + i += next_skip; + } +} + +/* + * BLIST_RADIX_COPY() - copy one radix tree to another + * + * Locates free space in the source tree and frees it in the destination + * tree. The space may not already be free in the destination. + */ + +static void blst_copy( + blmeta_t *scan, + daddr_t blk, + daddr_t radix, + daddr_t skip, + blist_t dest, + daddr_t count +) { + int next_skip; + int i; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + u_daddr_t v = scan->u.bmu_bitmap; + + if (v == (u_daddr_t)-1) { + blist_free(dest, blk, count); + } else if (v != 0) { + int i; + + for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { + if (v & (1 << i)) + blist_free(dest, blk + i, 1); + } + } + return; + } + + /* + * Meta node + */ + + if (scan->u.bmu_avail == 0) { + /* + * Source all allocated, leave dest allocated + */ + return; + } + if (scan->u.bmu_avail == radix) { + /* + * Source all free, free entire dest + */ + if (count < radix) + blist_free(dest, blk, count); + else + blist_free(dest, blk, radix); + return; + } + + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; count && i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + + if (count >= radix) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + radix + ); + count -= radix; + } else { + if (count) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + count + ); + } + count = 0; + } + blk += radix; + } +} + +/* + * BLST_RADIX_INIT() - initialize radix tree + * + * Initialize our meta structures and bitmaps and calculate the exact + * amount of space required to manage 'count' blocks - this space may + * be considerably less then the calculated radix due to the large + * RADIX values we use. 
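+ *
+ *	For example (illustrative, using the 2048-block layout sketched in
+ *	the file header comment): a full radix-8192 tree would take
+ *	1 + 16 * 17 = 273 blmeta_t records, but only 70 are needed here:
+ *	the root, four fully populated subtrees of 17 records each, and
+ *	one terminator record.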
+ */ + +static daddr_t +blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) +{ + int i; + int next_skip; + daddr_t memindex = 0; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_bitmap = 0; + } + return(memindex); + } + + /* + * Meta node. If allocating the entire object we can special + * case it. However, we need to figure out how much memory + * is required to manage 'count' blocks, so we continue on anyway. + */ + + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_avail = 0; + } + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; i <= skip; i += next_skip) { + if (count >= radix) { + /* + * Allocate the entire object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + radix + ); + count -= radix; + } else if (count > 0) { + /* + * Allocate a partial object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + count + ); + count = 0; + } else { + /* + * Add terminator and break out + */ + if (scan) + scan[i].bm_bighint = (daddr_t)-1; + break; + } + } + if (memindex < i) + memindex = i; + return(memindex); +} + +#ifdef BLIST_DEBUG + +static void +blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) +{ + int i; + int next_skip; + int lastState = 0; + + if (radix == BLIST_BMAP_RADIX) { + printf( + "%*.*s(%04x,%d): bitmap %08x big=%d\n", + tab, tab, "", + blk, radix, + scan->u.bmu_bitmap, + scan->bm_bighint + ); + return; + } + + if (scan->u.bmu_avail == 0) { + printf( + "%*.*s(%04x,%d) ALL ALLOCATED\n", + tab, tab, "", + blk, + radix + ); + return; + } + if (scan->u.bmu_avail == radix) { + printf( + "%*.*s(%04x,%d) ALL FREE\n", + tab, tab, "", + blk, + radix + ); + return; + } + + printf( + "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n", + tab, tab, "", + blk, radix, + scan->u.bmu_avail, + radix, + scan->bm_bighint + ); + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + tab += 4; + + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) { + printf( + "%*.*s(%04x,%d): Terminator\n", + tab, tab, "", + blk, radix + ); + lastState = 0; + break; + } + blst_radix_print( + &scan[i], + blk, + radix, + next_skip - 1, + tab + ); + blk += radix; + } + tab -= 4; + + printf( + "%*.*s}\n", + tab, tab, "" + ); +} + +#endif + +#ifdef BLIST_DEBUG + +int +main(int ac, char **av) +{ + int size = 1024; + int i; + blist_t bl; + + for (i = 1; i < ac; ++i) { + const char *ptr = av[i]; + if (*ptr != '-') { + size = strtol(ptr, NULL, 0); + continue; + } + ptr += 2; + fprintf(stderr, "Bad option: %s\n", ptr - 2); + exit(1); + } + bl = blist_create(size); + blist_free(bl, 0, size); + + for (;;) { + char buf[1024]; + daddr_t da = 0; + daddr_t count = 0; + + + printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix); + fflush(stdout); + if (fgets(buf, sizeof(buf), stdin) == NULL) + break; + switch(buf[0]) { + case 'r': + if (sscanf(buf + 1, "%d", &count) == 1) { + blist_resize(&bl, count, 1); + } else { + printf("?\n"); + } + case 'p': + blist_print(bl); + break; + case 'a': + if (sscanf(buf + 1, "%d", &count) == 1) { + daddr_t blk = blist_alloc(bl, count); + printf(" R=%04x\n", blk); + } else { + printf("?\n"); + } + break; + case 'f': + if (sscanf(buf + 1, "%x %d", &da, &count) == 2) { + blist_free(bl, da, count); + } else { + printf("?\n"); + } + break; + case '?': + case 'h': + puts( + "p -print\n" + "a %d 
-allocate\n" + "f %x %d -free\n" + "r %d -resize\n" + "h/? -help" + ); + break; + default: + printf("?\n"); + break; + } + } + return(0); +} + +void +panic(const char *ctl, ...) +{ + va_list va; + + va_start(va, ctl); + vfprintf(stderr, ctl, va); + fprintf(stderr, "\n"); + va_end(va); + exit(1); +} + +#endif + diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c new file mode 100644 index 0000000..7281051 --- /dev/null +++ b/sys/kern/subr_bus.c @@ -0,0 +1,2179 @@ +/*- + * Copyright (c) 1997,1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_bus.h" + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/kobj.h> +#include <sys/bus_private.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/stdarg.h> /* for device_printf() */ + +static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); + +#ifdef BUS_DEBUG + +static int bus_debug = 1; +SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0, + "Debug bus code"); + +#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a, printf("\n");} +#define DEVICENAME(d) ((d)? device_get_name(d): "no device") +#define DRIVERNAME(d) ((d)? d->name : "no driver") +#define DEVCLANAME(d) ((d)? d->name : "no devclass") + +/* Produce the indenting, indent*2 spaces plus a '.' 
ahead of that to + * prevent syslog from deleting initial spaces + */ +#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0) + +static void print_device_short(device_t dev, int indent); +static void print_device(device_t dev, int indent); +void print_device_tree_short(device_t dev, int indent); +void print_device_tree(device_t dev, int indent); +static void print_driver_short(driver_t *driver, int indent); +static void print_driver(driver_t *driver, int indent); +static void print_driver_list(driver_list_t drivers, int indent); +static void print_devclass_short(devclass_t dc, int indent); +static void print_devclass(devclass_t dc, int indent); +void print_devclass_list_short(void); +void print_devclass_list(void); + +#else +/* Make the compiler ignore the function calls */ +#define PDEBUG(a) /* nop */ +#define DEVICENAME(d) /* nop */ +#define DRIVERNAME(d) /* nop */ +#define DEVCLANAME(d) /* nop */ + +#define print_device_short(d,i) /* nop */ +#define print_device(d,i) /* nop */ +#define print_device_tree_short(d,i) /* nop */ +#define print_device_tree(d,i) /* nop */ +#define print_driver_short(d,i) /* nop */ +#define print_driver(d,i) /* nop */ +#define print_driver_list(d,i) /* nop */ +#define print_devclass_short(d,i) /* nop */ +#define print_devclass(d,i) /* nop */ +#define print_devclass_list_short() /* nop */ +#define print_devclass_list() /* nop */ +#endif + +TAILQ_HEAD(,device) bus_data_devices; +static int bus_data_generation = 1; + +kobj_method_t null_methods[] = { + { 0, 0 } +}; + +DEFINE_CLASS(null, null_methods, 0); + +/* + * Devclass implementation + */ + +static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); + +static devclass_t +devclass_find_internal(const char *classname, int create) +{ + devclass_t dc; + + PDEBUG(("looking for %s", classname)); + if (!classname) + return (NULL); + + TAILQ_FOREACH(dc, &devclasses, link) { + if (!strcmp(dc->name, classname)) + return (dc); + } + + PDEBUG(("%s not found%s", classname, (create? ", creating": ""))); + if (create) { + dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, + M_BUS, M_NOWAIT|M_ZERO); + if (!dc) + return (NULL); + dc->name = (char*) (dc + 1); + strcpy(dc->name, classname); + TAILQ_INIT(&dc->drivers); + TAILQ_INSERT_TAIL(&devclasses, dc, link); + + bus_data_generation_update(); + } + + return (dc); +} + +devclass_t +devclass_create(const char *classname) +{ + return (devclass_find_internal(classname, TRUE)); +} + +devclass_t +devclass_find(const char *classname) +{ + return (devclass_find_internal(classname, FALSE)); +} + +int +devclass_add_driver(devclass_t dc, driver_t *driver) +{ + driverlink_t dl; + int i; + + PDEBUG(("%s", DRIVERNAME(driver))); + + dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); + if (!dl) + return (ENOMEM); + + /* + * Compile the driver's methods. Also increase the reference count + * so that the class doesn't get freed when the last instance + * goes. This means we can safely use static methods and avoids a + * double-free in devclass_delete_driver. + */ + kobj_class_compile((kobj_class_t) driver); + + /* + * Make sure the devclass which the driver is implementing exists. + */ + devclass_find_internal(driver->name, TRUE); + + dl->driver = driver; + TAILQ_INSERT_TAIL(&dc->drivers, dl, link); + driver->refs++; + + /* + * Call BUS_DRIVER_ADDED for any existing busses in this class. 
+ */ + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + BUS_DRIVER_ADDED(dc->devices[i], driver); + + bus_data_generation_update(); + return (0); +} + +int +devclass_delete_driver(devclass_t busclass, driver_t *driver) +{ + devclass_t dc = devclass_find(driver->name); + driverlink_t dl; + device_t dev; + int i; + int error; + + PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); + + if (!dc) + return (0); + + /* + * Find the link structure in the bus' list of drivers. + */ + TAILQ_FOREACH(dl, &busclass->drivers, link) { + if (dl->driver == driver) + break; + } + + if (!dl) { + PDEBUG(("%s not found in %s list", driver->name, + busclass->name)); + return (ENOENT); + } + + /* + * Disassociate from any devices. We iterate through all the + * devices in the devclass of the driver and detach any which are + * using the driver and which have a parent in the devclass which + * we are deleting from. + * + * Note that since a driver can be in multiple devclasses, we + * should not detach devices which are not children of devices in + * the affected devclass. + */ + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + dev = dc->devices[i]; + if (dev->driver == driver && dev->parent && + dev->parent->devclass == busclass) { + if ((error = device_detach(dev)) != 0) + return (error); + device_set_driver(dev, NULL); + } + } + } + + TAILQ_REMOVE(&busclass->drivers, dl, link); + free(dl, M_BUS); + + driver->refs--; + if (driver->refs == 0) + kobj_class_free((kobj_class_t) driver); + + bus_data_generation_update(); + return (0); +} + +static driverlink_t +devclass_find_driver_internal(devclass_t dc, const char *classname) +{ + driverlink_t dl; + + PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); + + TAILQ_FOREACH(dl, &dc->drivers, link) { + if (!strcmp(dl->driver->name, classname)) + return (dl); + } + + PDEBUG(("not found")); + return (NULL); +} + +driver_t * +devclass_find_driver(devclass_t dc, const char *classname) +{ + driverlink_t dl; + + dl = devclass_find_driver_internal(dc, classname); + if (dl) + return (dl->driver); + return (NULL); +} + +const char * +devclass_get_name(devclass_t dc) +{ + return (dc->name); +} + +device_t +devclass_get_device(devclass_t dc, int unit) +{ + if (dc == NULL || unit < 0 || unit >= dc->maxunit) + return (NULL); + return (dc->devices[unit]); +} + +void * +devclass_get_softc(devclass_t dc, int unit) +{ + device_t dev; + + dev = devclass_get_device(dc, unit); + if (!dev) + return (NULL); + + return (device_get_softc(dev)); +} + +int +devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) +{ + int i; + int count; + device_t *list; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); + if (!list) + return (ENOMEM); + + count = 0; + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + list[count] = dc->devices[i]; + count++; + } + } + + *devlistp = list; + *devcountp = count; + + return (0); +} + +int +devclass_get_maxunit(devclass_t dc) +{ + return (dc->maxunit); +} + +int +devclass_find_free_unit(devclass_t dc, int unit) +{ + if (dc == NULL) + return (unit); + while (unit < dc->maxunit && dc->devices[unit] != NULL) + unit++; + return (unit); +} + +static int +devclass_alloc_unit(devclass_t dc, int *unitp) +{ + int unit = *unitp; + + PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); + + /* If we were given a wired unit number, check for existing device */ + /* XXX imp XXX */ + if (unit != -1) { 
+ if (unit >= 0 && unit < dc->maxunit && + dc->devices[unit] != NULL) { + if (bootverbose) + printf("%s: %s%d already exists; skipping it\n", + dc->name, dc->name, *unitp); + return (EEXIST); + } + } else { + /* Unwired device, find the next available slot for it */ + unit = 0; + while (unit < dc->maxunit && dc->devices[unit] != NULL) + unit++; + } + + /* + * We've selected a unit beyond the length of the table, so let's + * extend the table to make room for all units up to and including + * this one. + */ + if (unit >= dc->maxunit) { + device_t *newlist; + int newsize; + + newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); + newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); + if (!newlist) + return (ENOMEM); + bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit); + bzero(newlist + dc->maxunit, + sizeof(device_t) * (newsize - dc->maxunit)); + if (dc->devices) + free(dc->devices, M_BUS); + dc->devices = newlist; + dc->maxunit = newsize; + } + PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); + + *unitp = unit; + return (0); +} + +static int +devclass_add_device(devclass_t dc, device_t dev) +{ + int buflen, error; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + buflen = snprintf(NULL, 0, "%s%d$", dc->name, dev->unit); + if (buflen < 0) + return (ENOMEM); + dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); + if (!dev->nameunit) + return (ENOMEM); + + if ((error = devclass_alloc_unit(dc, &dev->unit)) != 0) { + free(dev->nameunit, M_BUS); + dev->nameunit = NULL; + return (error); + } + dc->devices[dev->unit] = dev; + dev->devclass = dc; + snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); + + return (0); +} + +static int +devclass_delete_device(devclass_t dc, device_t dev) +{ + if (!dc || !dev) + return (0); + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (dev->devclass != dc || dc->devices[dev->unit] != dev) + panic("devclass_delete_device: inconsistent device class"); + dc->devices[dev->unit] = NULL; + if (dev->flags & DF_WILDCARD) + dev->unit = -1; + dev->devclass = NULL; + free(dev->nameunit, M_BUS); + dev->nameunit = NULL; + + return (0); +} + +static device_t +make_device(device_t parent, const char *name, int unit) +{ + device_t dev; + devclass_t dc; + + PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); + + if (name) { + dc = devclass_find_internal(name, TRUE); + if (!dc) { + printf("make_device: can't find device class %s\n", + name); + return (NULL); + } + } else { + dc = NULL; + } + + dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO); + if (!dev) + return (NULL); + + dev->parent = parent; + TAILQ_INIT(&dev->children); + kobj_init((kobj_t) dev, &null_class); + dev->driver = NULL; + dev->devclass = NULL; + dev->unit = unit; + dev->nameunit = NULL; + dev->desc = NULL; + dev->busy = 0; + dev->devflags = 0; + dev->flags = DF_ENABLED; + dev->order = 0; + if (unit == -1) + dev->flags |= DF_WILDCARD; + if (name) { + dev->flags |= DF_FIXEDCLASS; + if (devclass_add_device(dc, dev)) { + kobj_delete((kobj_t) dev, M_BUS); + return (NULL); + } + } + dev->ivars = NULL; + dev->softc = NULL; + + dev->state = DS_NOTPRESENT; + + TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); + bus_data_generation_update(); + + return (dev); +} + +static int +device_print_child(device_t dev, device_t child) +{ + int retval = 0; + + if (device_is_alive(child)) + retval += BUS_PRINT_CHILD(dev, child); + else + retval += device_printf(child, " not found\n"); + + return (retval); +} 
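+
+/*
+ * Illustrative sketch (the foo driver and foo_identify() are hypothetical):
+ * bus drivers normally create their children from an identify (or attach)
+ * method using device_add_child() below; the child is then probed and
+ * attached later via device_probe_and_attach().
+ *
+ *	static void
+ *	foo_identify(driver_t *driver, device_t parent)
+ *	{
+ *		if (device_find_child(parent, "foo", 0) == NULL)
+ *			device_add_child(parent, "foo", -1);
+ *	}
+ *
+ * Passing unit -1 asks for the next free unit number (the DF_WILDCARD
+ * case handled by make_device() above and devclass_alloc_unit()).
+ */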
+ +device_t +device_add_child(device_t dev, const char *name, int unit) +{ + return (device_add_child_ordered(dev, 0, name, unit)); +} + +device_t +device_add_child_ordered(device_t dev, int order, const char *name, int unit) +{ + device_t child; + device_t place; + + PDEBUG(("%s at %s with order %d as unit %d", + name, DEVICENAME(dev), order, unit)); + + child = make_device(dev, name, unit); + if (child == NULL) + return (child); + child->order = order; + + TAILQ_FOREACH(place, &dev->children, link) { + if (place->order > order) + break; + } + + if (place) { + /* + * The device 'place' is the first device whose order is + * greater than the new child. + */ + TAILQ_INSERT_BEFORE(place, child, link); + } else { + /* + * The new child's order is greater or equal to the order of + * any existing device. Add the child to the tail of the list. + */ + TAILQ_INSERT_TAIL(&dev->children, child, link); + } + + bus_data_generation_update(); + return (child); +} + +int +device_delete_child(device_t dev, device_t child) +{ + int error; + device_t grandchild; + + PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); + + /* remove children first */ + while ( (grandchild = TAILQ_FIRST(&child->children)) ) { + error = device_delete_child(child, grandchild); + if (error) + return (error); + } + + if ((error = device_detach(child)) != 0) + return (error); + if (child->devclass) + devclass_delete_device(child->devclass, child); + TAILQ_REMOVE(&dev->children, child, link); + TAILQ_REMOVE(&bus_data_devices, child, devlink); + device_set_desc(child, NULL); + free(child, M_BUS); + + bus_data_generation_update(); + return (0); +} + +/* + * Find only devices attached to this bus. + */ +device_t +device_find_child(device_t dev, const char *classname, int unit) +{ + devclass_t dc; + device_t child; + + dc = devclass_find(classname); + if (!dc) + return (NULL); + + child = devclass_get_device(dc, unit); + if (child && child->parent == dev) + return (child); + return (NULL); +} + +static driverlink_t +first_matching_driver(devclass_t dc, device_t dev) +{ + if (dev->devclass) + return (devclass_find_driver_internal(dc, dev->devclass->name)); + return (TAILQ_FIRST(&dc->drivers)); +} + +static driverlink_t +next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) +{ + if (dev->devclass) { + driverlink_t dl; + for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) + if (!strcmp(dev->devclass->name, dl->driver->name)) + return (dl); + return (NULL); + } + return (TAILQ_NEXT(last, link)); +} + +static int +device_probe_child(device_t dev, device_t child) +{ + devclass_t dc; + driverlink_t best = 0; + driverlink_t dl; + int result, pri = 0; + int hasclass = (child->devclass != 0); + + dc = dev->devclass; + if (!dc) + panic("device_probe_child: parent device has no devclass"); + + if (child->state == DS_ALIVE) + return (0); + + for (dl = first_matching_driver(dc, child); + dl; + dl = next_matching_driver(dc, child, dl)) { + PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); + device_set_driver(child, dl->driver); + if (!hasclass) + device_set_devclass(child, dl->driver->name); + result = DEVICE_PROBE(child); + if (!hasclass) + device_set_devclass(child, 0); + + /* + * If the driver returns SUCCESS, there can be no higher match + * for this device. + */ + if (result == 0) { + best = dl; + pri = 0; + break; + } + + /* + * The driver returned an error so it certainly doesn't match. 
+ */ + if (result > 0) { + device_set_driver(child, 0); + continue; + } + + /* + * A priority lower than SUCCESS, remember the best matching + * driver. Initialise the value of pri for the first match. + */ + if (best == 0 || result > pri) { + best = dl; + pri = result; + continue; + } + } + + /* + * If we found a driver, change state and initialise the devclass. + */ + if (best) { + if (!child->devclass) + device_set_devclass(child, best->driver->name); + device_set_driver(child, best->driver); + if (pri < 0) { + /* + * A bit bogus. Call the probe method again to make + * sure that we have the right description. + */ + DEVICE_PROBE(child); + } + child->state = DS_ALIVE; + + bus_data_generation_update(); + return (0); + } + + return (ENXIO); +} + +device_t +device_get_parent(device_t dev) +{ + return (dev->parent); +} + +int +device_get_children(device_t dev, device_t **devlistp, int *devcountp) +{ + int count; + device_t child; + device_t *list; + + count = 0; + TAILQ_FOREACH(child, &dev->children, link) { + count++; + } + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); + if (!list) + return (ENOMEM); + + count = 0; + TAILQ_FOREACH(child, &dev->children, link) { + list[count] = child; + count++; + } + + *devlistp = list; + *devcountp = count; + + return (0); +} + +driver_t * +device_get_driver(device_t dev) +{ + return (dev->driver); +} + +devclass_t +device_get_devclass(device_t dev) +{ + return (dev->devclass); +} + +const char * +device_get_name(device_t dev) +{ + if (dev->devclass) + return (devclass_get_name(dev->devclass)); + return (NULL); +} + +const char * +device_get_nameunit(device_t dev) +{ + return (dev->nameunit); +} + +int +device_get_unit(device_t dev) +{ + return (dev->unit); +} + +const char * +device_get_desc(device_t dev) +{ + return (dev->desc); +} + +u_int32_t +device_get_flags(device_t dev) +{ + return (dev->devflags); +} + +int +device_print_prettyname(device_t dev) +{ + const char *name = device_get_name(dev); + + if (name == 0) + return (printf("unknown: ")); + return (printf("%s%d: ", name, device_get_unit(dev))); +} + +int +device_printf(device_t dev, const char * fmt, ...) 
+{ + va_list ap; + int retval; + + retval = device_print_prettyname(dev); + va_start(ap, fmt); + retval += vprintf(fmt, ap); + va_end(ap); + return (retval); +} + +static void +device_set_desc_internal(device_t dev, const char* desc, int copy) +{ + if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { + free(dev->desc, M_BUS); + dev->flags &= ~DF_DESCMALLOCED; + dev->desc = NULL; + } + + if (copy && desc) { + dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); + if (dev->desc) { + strcpy(dev->desc, desc); + dev->flags |= DF_DESCMALLOCED; + } + } else { + /* Avoid a -Wcast-qual warning */ + dev->desc = (char *)(uintptr_t) desc; + } + + bus_data_generation_update(); +} + +void +device_set_desc(device_t dev, const char* desc) +{ + device_set_desc_internal(dev, desc, FALSE); +} + +void +device_set_desc_copy(device_t dev, const char* desc) +{ + device_set_desc_internal(dev, desc, TRUE); +} + +void +device_set_flags(device_t dev, u_int32_t flags) +{ + dev->devflags = flags; +} + +void * +device_get_softc(device_t dev) +{ + return (dev->softc); +} + +void +device_set_softc(device_t dev, void *softc) +{ + if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) + free(dev->softc, M_BUS); + dev->softc = softc; + if (dev->softc) + dev->flags |= DF_EXTERNALSOFTC; + else + dev->flags &= ~DF_EXTERNALSOFTC; +} + +void * +device_get_ivars(device_t dev) +{ + return (dev->ivars); +} + +void +device_set_ivars(device_t dev, void * ivars) +{ + if (!dev) + return; + + dev->ivars = ivars; + + return; +} + +device_state_t +device_get_state(device_t dev) +{ + return (dev->state); +} + +void +device_enable(device_t dev) +{ + dev->flags |= DF_ENABLED; +} + +void +device_disable(device_t dev) +{ + dev->flags &= ~DF_ENABLED; +} + +void +device_busy(device_t dev) +{ + if (dev->state < DS_ATTACHED) + panic("device_busy: called for unattached device"); + if (dev->busy == 0 && dev->parent) + device_busy(dev->parent); + dev->busy++; + dev->state = DS_BUSY; +} + +void +device_unbusy(device_t dev) +{ + if (dev->state != DS_BUSY) + panic("device_unbusy: called for non-busy device"); + dev->busy--; + if (dev->busy == 0) { + if (dev->parent) + device_unbusy(dev->parent); + dev->state = DS_ATTACHED; + } +} + +void +device_quiet(device_t dev) +{ + dev->flags |= DF_QUIET; +} + +void +device_verbose(device_t dev) +{ + dev->flags &= ~DF_QUIET; +} + +int +device_is_quiet(device_t dev) +{ + return ((dev->flags & DF_QUIET) != 0); +} + +int +device_is_enabled(device_t dev) +{ + return ((dev->flags & DF_ENABLED) != 0); +} + +int +device_is_alive(device_t dev) +{ + return (dev->state >= DS_ALIVE); +} + +int +device_set_devclass(device_t dev, const char *classname) +{ + devclass_t dc; + int error; + + if (!classname) { + if (dev->devclass) + devclass_delete_device(dev->devclass, dev); + return (0); + } + + if (dev->devclass) { + printf("device_set_devclass: device class already set\n"); + return (EINVAL); + } + + dc = devclass_find_internal(classname, TRUE); + if (!dc) + return (ENOMEM); + + error = devclass_add_device(dc, dev); + + bus_data_generation_update(); + return (error); +} + +int +device_set_driver(device_t dev, driver_t *driver) +{ + if (dev->state >= DS_ATTACHED) + return (EBUSY); + + if (dev->driver == driver) + return (0); + + if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { + free(dev->softc, M_BUS); + dev->softc = NULL; + } + kobj_delete((kobj_t) dev, 0); + dev->driver = driver; + if (driver) { + kobj_init((kobj_t) dev, (kobj_class_t) driver); + if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { + dev->softc = 
malloc(driver->size, M_BUS, + M_NOWAIT | M_ZERO); + if (!dev->softc) { + kobj_init((kobj_t) dev, &null_class); + dev->driver = NULL; + return (ENOMEM); + } + } + } else { + kobj_init((kobj_t) dev, &null_class); + } + + bus_data_generation_update(); + return (0); +} + +int +device_probe_and_attach(device_t dev) +{ + device_t bus = dev->parent; + int error = 0; + int hasclass = (dev->devclass != 0); + + if (dev->state >= DS_ALIVE) + return (0); + + if (dev->flags & DF_ENABLED) { + error = device_probe_child(bus, dev); + if (!error) { + if (!device_is_quiet(dev)) + device_print_child(bus, dev); + error = DEVICE_ATTACH(dev); + if (!error) + dev->state = DS_ATTACHED; + else { + printf("device_probe_and_attach: %s%d attach returned %d\n", + dev->driver->name, dev->unit, error); + /* Unset the class; set in device_probe_child */ + if (!hasclass) + device_set_devclass(dev, 0); + device_set_driver(dev, NULL); + dev->state = DS_NOTPRESENT; + } + } else { + if (!(dev->flags & DF_DONENOMATCH)) { + BUS_PROBE_NOMATCH(bus, dev); + dev->flags |= DF_DONENOMATCH; + } + } + } else { + if (bootverbose) { + device_print_prettyname(dev); + printf("not probed (disabled)\n"); + } + } + + return (error); +} + +int +device_detach(device_t dev) +{ + int error; + + PDEBUG(("%s", DEVICENAME(dev))); + if (dev->state == DS_BUSY) + return (EBUSY); + if (dev->state != DS_ATTACHED) + return (0); + + if ((error = DEVICE_DETACH(dev)) != 0) + return (error); + device_printf(dev, "detached\n"); + if (dev->parent) + BUS_CHILD_DETACHED(dev->parent, dev); + + if (!(dev->flags & DF_FIXEDCLASS)) + devclass_delete_device(dev->devclass, dev); + + dev->state = DS_NOTPRESENT; + device_set_driver(dev, NULL); + + return (0); +} + +int +device_shutdown(device_t dev) +{ + if (dev->state < DS_ATTACHED) + return (0); + return (DEVICE_SHUTDOWN(dev)); +} + +int +device_set_unit(device_t dev, int unit) +{ + devclass_t dc; + int err; + + dc = device_get_devclass(dev); + if (unit < dc->maxunit && dc->devices[unit]) + return (EBUSY); + err = devclass_delete_device(dc, dev); + if (err) + return (err); + dev->unit = unit; + err = devclass_add_device(dc, dev); + if (err) + return (err); + + bus_data_generation_update(); + return (0); +} + +/*======================================*/ +/* + * Some useful method implementations to make life easier for bus drivers. 
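+ *
+ * As a sketch (the foo bus and its per-child ivars layout are hypothetical),
+ * a bus that keeps a struct resource_list for each child can implement its
+ * resource methods almost entirely with the helpers below:
+ *
+ *	static struct resource *
+ *	foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
+ *	    u_long start, u_long end, u_long count, u_int flags)
+ *	{
+ *		struct foo_child *fc = device_get_ivars(child);
+ *
+ *		return (resource_list_alloc(&fc->fc_resources, bus, child,
+ *		    type, rid, start, end, count, flags));
+ *	}
+ *
+ * The bus_generic_rl_* wrappers further down do the same thing, fetching
+ * the list with BUS_GET_RESOURCE_LIST() instead of from the ivars.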
+ */ + +void +resource_list_init(struct resource_list *rl) +{ + SLIST_INIT(rl); +} + +void +resource_list_free(struct resource_list *rl) +{ + struct resource_list_entry *rle; + + while ((rle = SLIST_FIRST(rl)) != NULL) { + if (rle->res) + panic("resource_list_free: resource entry is busy"); + SLIST_REMOVE_HEAD(rl, link); + free(rle, M_BUS); + } +} + +int +resource_list_add_next(struct resource_list *rl, int type, + u_long start, u_long end, u_long count) +{ + int rid; + + rid = 0; + while (resource_list_find(rl, type, rid)) rid++; + resource_list_add(rl, type, rid, start, end, count); + + return (rid); +} + +void +resource_list_add(struct resource_list *rl, int type, int rid, + u_long start, u_long end, u_long count) +{ + struct resource_list_entry *rle; + + rle = resource_list_find(rl, type, rid); + if (!rle) { + rle = malloc(sizeof(struct resource_list_entry), M_BUS, + M_NOWAIT); + if (!rle) + panic("resource_list_add: can't record entry"); + SLIST_INSERT_HEAD(rl, rle, link); + rle->type = type; + rle->rid = rid; + rle->res = NULL; + } + + if (rle->res) + panic("resource_list_add: resource entry is busy"); + + rle->start = start; + rle->end = end; + rle->count = count; +} + +struct resource_list_entry * +resource_list_find(struct resource_list *rl, int type, int rid) +{ + struct resource_list_entry *rle; + + SLIST_FOREACH(rle, rl, link) { + if (rle->type == type && rle->rid == rid) + return (rle); + } + return (NULL); +} + +void +resource_list_delete(struct resource_list *rl, int type, int rid) +{ + struct resource_list_entry *rle = resource_list_find(rl, type, rid); + + if (rle) { + if (rle->res != NULL) + panic("resource_list_delete: resource has not been released"); + SLIST_REMOVE(rl, rle, resource_list_entry, link); + free(rle, M_BUS); + } +} + +struct resource * +resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, + int type, int *rid, u_long start, u_long end, u_long count, u_int flags) +{ + struct resource_list_entry *rle = 0; + int passthrough = (device_get_parent(child) != bus); + int isdefault = (start == 0UL && end == ~0UL); + + if (passthrough) { + return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, + type, rid, start, end, count, flags)); + } + + rle = resource_list_find(rl, type, *rid); + + if (!rle) + return (NULL); /* no resource of that type/rid */ + + if (rle->res) + panic("resource_list_alloc: resource entry is busy"); + + if (isdefault) { + start = rle->start; + count = ulmax(count, rle->count); + end = ulmax(rle->end, start + count - 1); + } + + rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, + type, rid, start, end, count, flags); + + /* + * Record the new range. 
+ */ + if (rle->res) { + rle->start = rman_get_start(rle->res); + rle->end = rman_get_end(rle->res); + rle->count = count; + } + + return (rle->res); +} + +int +resource_list_release(struct resource_list *rl, device_t bus, device_t child, + int type, int rid, struct resource *res) +{ + struct resource_list_entry *rle = 0; + int passthrough = (device_get_parent(child) != bus); + int error; + + if (passthrough) { + return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, + type, rid, res)); + } + + rle = resource_list_find(rl, type, rid); + + if (!rle) + panic("resource_list_release: can't find resource"); + if (!rle->res) + panic("resource_list_release: resource entry is not busy"); + + error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, + type, rid, res); + if (error) + return (error); + + rle->res = NULL; + return (0); +} + +int +resource_list_print_type(struct resource_list *rl, const char *name, int type, + const char *format) +{ + struct resource_list_entry *rle; + int printed, retval; + + printed = 0; + retval = 0; + /* Yes, this is kinda cheating */ + SLIST_FOREACH(rle, rl, link) { + if (rle->type == type) { + if (printed == 0) + retval += printf(" %s ", name); + else + retval += printf(","); + printed++; + retval += printf(format, rle->start); + if (rle->count > 1) { + retval += printf("-"); + retval += printf(format, rle->start + + rle->count - 1); + } + } + } + return (retval); +} + +/* + * Call DEVICE_IDENTIFY for each driver. + */ +int +bus_generic_probe(device_t dev) +{ + devclass_t dc = dev->devclass; + driverlink_t dl; + + TAILQ_FOREACH(dl, &dc->drivers, link) { + DEVICE_IDENTIFY(dl->driver, dev); + } + + return (0); +} + +int +bus_generic_attach(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + device_probe_and_attach(child); + } + + return (0); +} + +int +bus_generic_detach(device_t dev) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return (EBUSY); + + TAILQ_FOREACH(child, &dev->children, link) { + if ((error = device_detach(child)) != 0) + return (error); + } + + return (0); +} + +int +bus_generic_shutdown(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + device_shutdown(child); + } + + return (0); +} + +int +bus_generic_suspend(device_t dev) +{ + int error; + device_t child, child2; + + TAILQ_FOREACH(child, &dev->children, link) { + error = DEVICE_SUSPEND(child); + if (error) { + for (child2 = TAILQ_FIRST(&dev->children); + child2 && child2 != child; + child2 = TAILQ_NEXT(child2, link)) + DEVICE_RESUME(child2); + return (error); + } + } + return (0); +} + +int +bus_generic_resume(device_t dev) +{ + device_t child; + + TAILQ_FOREACH(child, &dev->children, link) { + DEVICE_RESUME(child); + /* if resume fails, there's nothing we can usefully do... 
*/ + } + return (0); +} + +int +bus_print_child_header (device_t dev, device_t child) +{ + int retval = 0; + + if (device_get_desc(child)) { + retval += device_printf(child, "<%s>", device_get_desc(child)); + } else { + retval += printf("%s", device_get_nameunit(child)); + } + + return (retval); +} + +int +bus_print_child_footer (device_t dev, device_t child) +{ + return (printf(" on %s\n", device_get_nameunit(dev))); +} + +int +bus_generic_print_child(device_t dev, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(dev, child); + retval += bus_print_child_footer(dev, child); + + return (retval); +} + +int +bus_generic_read_ivar(device_t dev, device_t child, int index, + uintptr_t * result) +{ + return (ENOENT); +} + +int +bus_generic_write_ivar(device_t dev, device_t child, int index, + uintptr_t value) +{ + return (ENOENT); +} + +struct resource_list * +bus_generic_get_resource_list (device_t dev, device_t child) +{ + return (NULL); +} + +void +bus_generic_driver_added(device_t dev, driver_t *driver) +{ + device_t child; + + DEVICE_IDENTIFY(driver, dev); + TAILQ_FOREACH(child, &dev->children, link) { + if (child->state == DS_NOTPRESENT) + device_probe_and_attach(child); + } +} + +int +bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, + int flags, driver_intr_t *intr, void *arg, void **cookiep) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_SETUP_INTR(dev->parent, child, irq, flags, + intr, arg, cookiep)); + return (EINVAL); +} + +int +bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, + void *cookie) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); + return (EINVAL); +} + +struct resource * +bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, + start, end, count, flags)); + return (NULL); +} + +int +bus_generic_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_deactivate_resource(device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. 
*/ + if (dev->parent) + return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + return (EINVAL); +} + +int +bus_generic_rl_get_resource (device_t dev, device_t child, int type, int rid, + u_long *startp, u_long *countp) +{ + struct resource_list * rl = NULL; + struct resource_list_entry * rle = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + rle = resource_list_find(rl, type, rid); + if (!rle) + return (ENOENT); + + if (startp) + *startp = rle->start; + if (countp) + *countp = rle->count; + + return (0); +} + +int +bus_generic_rl_set_resource (device_t dev, device_t child, int type, int rid, + u_long start, u_long count) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + resource_list_add(rl, type, rid, start, (start + count - 1), count); + + return (0); +} + +void +bus_generic_rl_delete_resource (device_t dev, device_t child, int type, int rid) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return; + + resource_list_delete(rl, type, rid); + + return; +} + +int +bus_generic_rl_release_resource (device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (EINVAL); + + return (resource_list_release(rl, dev, child, type, rid, r)); +} + +struct resource * +bus_generic_rl_alloc_resource (device_t dev, device_t child, int type, + int *rid, u_long start, u_long end, u_long count, u_int flags) +{ + struct resource_list * rl = NULL; + + rl = BUS_GET_RESOURCE_LIST(dev, child); + if (!rl) + return (NULL); + + return (resource_list_alloc(rl, dev, child, type, rid, + start, end, count, flags)); +} + +/* + * Some convenience functions to make it easier for drivers to use the + * resource-management functions. All these really do is hide the + * indirection through the parent's method table, making for slightly + * less-wordy code. In the future, it might make sense for this code + * to maintain some sort of a list of resources allocated by each device. 
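+ *
+ * A typical consumer looks something like the sketch below (foo_intr()
+ * and the softc fields are hypothetical; SYS_RES_IRQ, RF_ACTIVE and
+ * INTR_TYPE_MISC are the usual resource/interrupt constants):
+ *
+ *	int rid = 0;
+ *
+ *	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid,
+ *	    0UL, ~0UL, 1, RF_ACTIVE);
+ *	if (sc->irq_res == NULL)
+ *		return (ENXIO);
+ *	error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC,
+ *	    foo_intr, sc, &sc->irq_cookie);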
+ */ +struct resource * +bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, + u_long count, u_int flags) +{ + if (dev->parent == 0) + return (0); + return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, + count, flags)); +} + +int +bus_activate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_release_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_setup_intr(device_t dev, struct resource *r, int flags, + driver_intr_t handler, void *arg, void **cookiep) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_SETUP_INTR(dev->parent, dev, r, flags, + handler, arg, cookiep)); +} + +int +bus_teardown_intr(device_t dev, struct resource *r, void *cookie) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); +} + +int +bus_set_resource(device_t dev, int type, int rid, + u_long start, u_long count) +{ + return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, + start, count)); +} + +int +bus_get_resource(device_t dev, int type, int rid, + u_long *startp, u_long *countp) +{ + return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + startp, countp)); +} + +u_long +bus_get_resource_start(device_t dev, int type, int rid) +{ + u_long start, count; + int error; + + error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + &start, &count); + if (error) + return (0); + return (start); +} + +u_long +bus_get_resource_count(device_t dev, int type, int rid) +{ + u_long start, count; + int error; + + error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, + &start, &count); + if (error) + return (0); + return (count); +} + +void +bus_delete_resource(device_t dev, int type, int rid) +{ + BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); +} + +static int +root_print_child(device_t dev, device_t child) +{ + int retval = 0; + + retval += bus_print_child_header(dev, child); + retval += printf("\n"); + + return (retval); +} + +static int +root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg, + void **cookiep) +{ + /* + * If an interrupt mapping gets to here something bad has happened. 
+ */ + panic("root_setup_intr"); +} + +static kobj_method_t root_methods[] = { + /* Device interface */ + KOBJMETHOD(device_shutdown, bus_generic_shutdown), + KOBJMETHOD(device_suspend, bus_generic_suspend), + KOBJMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + KOBJMETHOD(bus_print_child, root_print_child), + KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), + KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), + KOBJMETHOD(bus_setup_intr, root_setup_intr), + + { 0, 0 } +}; + +static driver_t root_driver = { + "root", + root_methods, + 1, /* no softc */ +}; + +device_t root_bus; +devclass_t root_devclass; + +static int +root_bus_module_handler(module_t mod, int what, void* arg) +{ + switch (what) { + case MOD_LOAD: + TAILQ_INIT(&bus_data_devices); + kobj_class_compile((kobj_class_t) &root_driver); + root_bus = make_device(NULL, "root", 0); + root_bus->desc = "System root bus"; + kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); + root_bus->driver = &root_driver; + root_bus->state = DS_ATTACHED; + root_devclass = devclass_find_internal("root", FALSE); + return (0); + + case MOD_SHUTDOWN: + device_shutdown(root_bus); + return (0); + } + + return (0); +} + +static moduledata_t root_bus_mod = { + "rootbus", + root_bus_module_handler, + 0 +}; +DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); + +void +root_bus_configure(void) +{ + device_t dev; + + PDEBUG((".")); + + TAILQ_FOREACH(dev, &root_bus->children, link) { + device_probe_and_attach(dev); + } +} + +int +driver_module_handler(module_t mod, int what, void *arg) +{ + int error, i; + struct driver_module_data *dmd; + devclass_t bus_devclass; + + dmd = (struct driver_module_data *)arg; + bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE); + error = 0; + + switch (what) { + case MOD_LOAD: + if (dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); + + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Loading module: driver %s on bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), dmd->dmd_busname)); + error = devclass_add_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + if (error) + break; + + /* + * The drivers loaded in this way are assumed to all + * implement the same devclass. + */ + *dmd->dmd_devclass = + devclass_find_internal(dmd->dmd_drivers[0]->name, TRUE); + break; + + case MOD_UNLOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Unloading module: driver %s from bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_delete_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + + if (!error && dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); + break; + } + + return (error); +} + +#ifdef BUS_DEBUG + +/* the _short versions avoid iteration by not calling anything that prints + * more than oneliners. I love oneliners. + */ + +static void +print_device_short(device_t dev, int indent) +{ + if (!dev) + return; + + indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s,%sivars,%ssoftc,busy=%d\n", + dev->unit, dev->desc, + (dev->parent? "":"no "), + (TAILQ_EMPTY(&dev->children)? "no ":""), + (dev->flags&DF_ENABLED? "enabled,":"disabled,"), + (dev->flags&DF_FIXEDCLASS? "fixed,":""), + (dev->flags&DF_WILDCARD? "wildcard,":""), + (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), + (dev->ivars? "":"no "), + (dev->softc? 
"":"no "), + dev->busy)); +} + +static void +print_device(device_t dev, int indent) +{ + if (!dev) + return; + + print_device_short(dev, indent); + + indentprintf(("Parent:\n")); + print_device_short(dev->parent, indent+1); + indentprintf(("Driver:\n")); + print_driver_short(dev->driver, indent+1); + indentprintf(("Devclass:\n")); + print_devclass_short(dev->devclass, indent+1); +} + +void +print_device_tree_short(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device_short(dev, indent); + + TAILQ_FOREACH(child, &dev->children, link) { + print_device_tree_short(child, indent+1); + } +} + +void +print_device_tree(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device(dev, indent); + + TAILQ_FOREACH(child, &dev->children, link) { + print_device_tree(child, indent+1); + } +} + +static void +print_driver_short(driver_t *driver, int indent) +{ + if (!driver) + return; + + indentprintf(("driver %s: softc size = %d\n", + driver->name, driver->size)); +} + +static void +print_driver(driver_t *driver, int indent) +{ + if (!driver) + return; + + print_driver_short(driver, indent); +} + + +static void +print_driver_list(driver_list_t drivers, int indent) +{ + driverlink_t driver; + + TAILQ_FOREACH(driver, &drivers, link) { + print_driver(driver->driver, indent); + } +} + +static void +print_devclass_short(devclass_t dc, int indent) +{ + if ( !dc ) + return; + + indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); +} + +static void +print_devclass(devclass_t dc, int indent) +{ + int i; + + if ( !dc ) + return; + + print_devclass_short(dc, indent); + indentprintf(("Drivers:\n")); + print_driver_list(dc->drivers, indent+1); + + indentprintf(("Devices:\n")); + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + print_device(dc->devices[i], indent+1); +} + +void +print_devclass_list_short(void) +{ + devclass_t dc; + + printf("Short listing of devclasses, drivers & devices:\n"); + TAILQ_FOREACH(dc, &devclasses, link) { + print_devclass_short(dc, 0); + } +} + +void +print_devclass_list(void) +{ + devclass_t dc; + + printf("Full listing of devclasses, drivers & devices:\n"); + TAILQ_FOREACH(dc, &devclasses, link) { + print_devclass(dc, 0); + } +} + +#endif + +/* + * User-space access to the device tree. + * + * We implement a small set of nodes: + * + * hw.bus Single integer read method to obtain the + * current generation count. + * hw.bus.devices Reads the entire device tree in flat space. + * hw.bus.rman Resource manager interface + * + * We might like to add the ability to scan devclasses and/or drivers to + * determine what else is currently loaded/available. 
+ */ +SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); + +static int +sysctl_bus(SYSCTL_HANDLER_ARGS) +{ + struct u_businfo ubus; + + ubus.ub_version = BUS_USER_VERSION; + ubus.ub_generation = bus_data_generation; + + return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); +} +SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, + "bus-related data"); + +static int +sysctl_devices(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1; + u_int namelen = arg2; + int index; + struct device *dev; + struct u_device udev; /* XXX this is a bit big */ + int error; + + if (namelen != 2) + return (EINVAL); + + if (bus_data_generation_check(name[0])) + return (EINVAL); + + index = name[1]; + + /* + * Scan the list of devices, looking for the requested index. + */ + TAILQ_FOREACH(dev, &bus_data_devices, devlink) { + if (index-- == 0) + break; + } + if (dev == NULL) + return (ENOENT); + + /* + * Populate the return array. + */ + udev.dv_handle = (uintptr_t)dev; + udev.dv_parent = (uintptr_t)dev->parent; + if (dev->nameunit == NULL) { + udev.dv_name[0] = 0; + } else { + snprintf(udev.dv_name, 32, "%s", dev->nameunit); + } + if (dev->desc == NULL) { + udev.dv_desc[0] = 0; + } else { + snprintf(udev.dv_desc, 32, "%s", dev->desc); + } + if ((dev->driver == NULL) || (dev->driver->name == NULL)) { + udev.dv_drivername[0] = 0; + } else { + snprintf(udev.dv_drivername, 32, "%s", dev->driver->name); + } + error = SYSCTL_OUT(req, &udev, sizeof(udev)); + return (error); +} + +SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, + "system device tree"); + +/* + * Sysctl interface for scanning the resource lists. + * + * We take two input parameters; the index into the list of resource + * managers, and the resource offset into the list. + */ +static int +sysctl_rman(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1; + u_int namelen = arg2; + int rman_idx, res_idx; + struct rman *rm; + struct resource *res; + struct u_rman urm; + struct u_resource ures; + int error; + + if (namelen != 3) + return (EINVAL); + + if (bus_data_generation_check(name[0])) + return (EINVAL); + rman_idx = name[1]; + res_idx = name[2]; + + /* + * Find the indexed resource manager + */ + TAILQ_FOREACH(rm, &rman_head, rm_link) { + if (rman_idx-- == 0) + break; + } + if (rm == NULL) + return (ENOENT); + + /* + * If the resource index is -1, we want details on the + * resource manager. + */ + if (res_idx == -1) { + urm.rm_handle = (uintptr_t)rm; + snprintf(urm.rm_descr, RM_TEXTLEN, "%s", rm->rm_descr); + urm.rm_descr[RM_TEXTLEN - 1] = '\0'; + urm.rm_start = rm->rm_start; + urm.rm_size = rm->rm_end - rm->rm_start + 1; + urm.rm_type = rm->rm_type; + + error = SYSCTL_OUT(req, &urm, sizeof(urm)); + return (error); + } + + /* + * Find the indexed resource and return it. 
+ */ + TAILQ_FOREACH(res, &rm->rm_list, r_link) { + if (res_idx-- == 0) { + ures.r_handle = (uintptr_t)res; + ures.r_parent = (uintptr_t)res->r_rm; + ures.r_device = (uintptr_t)res->r_dev; + if (res->r_dev != NULL) { + if (device_get_name(res->r_dev) != NULL) { + snprintf(ures.r_devname, RM_TEXTLEN, + "%s%d", + device_get_name(res->r_dev), + device_get_unit(res->r_dev)); + } else { + snprintf(ures.r_devname, RM_TEXTLEN, + "nomatch"); + } + } else { + ures.r_devname[0] = 0; + } + ures.r_start = res->r_start; + ures.r_size = res->r_end - res->r_start + 1; + ures.r_flags = res->r_flags; + + error = SYSCTL_OUT(req, &ures, sizeof(ures)); + return (error); + } + } + return (ENOENT); +} + +SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman, + "kernel resource manager"); + +int +bus_data_generation_check(int generation) +{ + if (generation != bus_data_generation) + return (1); + + /* XXX generate optimised lists here? */ + return (0); +} + +void +bus_data_generation_update(void) +{ + bus_data_generation++; +} diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..78bb231 --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init(void *); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc(void); +static void cblock_alloc_cblocks(int number); +static void cblock_free(struct cblock *cblockp); +static void cblock_free_cblocks(int number); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + int cbsize = CBSIZE; + + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount, + cfreecount - cslushcount * cbsize, cslushcount * cbsize); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. 
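The comments above describe the reservation scheme: a small slush pool is created at boot, and each consumer reserves its own cblocks while it is open. A hedged sketch of that per-open pattern as a hypothetical tty driver might code it; the queue fields and the TTYHOG/512 sizing are illustrative assumptions, not taken from this file:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/clist.h>

/* Reserve queue space at open time (process context, at spltty()). */
static void
mydev_setup_queues(struct tty *tp)
{
	int s;

	s = spltty();
	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_outq, TTYHOG + 200, 512);
	splx(s);
}

/* Give the reservation back on last close. */
static void
mydev_release_queues(struct tty *tp)
{
	int s;

	s = spltty();
	clist_free_cblocks(&tp->t_rawq);
	clist_free_cblocks(&tp->t_canq);
	clist_free_cblocks(&tp->t_outq);
	splx(s);
}

The body of clist_alloc_cblocks() continues below.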
+ */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
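getc() and q_to_b() above are the two ways of consuming a clist: one character at a time (with TTY_QUOTE possibly or'ed in), or in bulk into a linear buffer. A hedged consumer-side sketch; the device_* helpers are hypothetical placeholders:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/clist.h>

static void device_transmit(char *buf, int n);	/* hypothetical helper */
static void device_transmit_char(int c);	/* hypothetical helper */

static void
drain_queue(struct clist *q)
{
	char buf[64];
	int n, c;

	/* Bulk: q_to_b() returns how many characters it actually moved. */
	while ((n = q_to_b(q, buf, sizeof(buf))) > 0)
		device_transmit(buf, n);

	/* Or one at a time: getc() returns -1 once the clist is empty. */
	while ((c = getc(q)) != -1)
		device_transmit_char(c & 0xff);	/* mask off TTY_QUOTE */
}

ndflush(), next, discards characters the same way q_to_b() does, just without copying them anywhere.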
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. 
+ */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a separate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
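On the producer side, b_to_q() above returns the number of characters it could not queue (so a non-zero return means the clist hit its cbmax/reserved limit), and putc() returns -1 under the same condition. A hedged sketch; the error value chosen is illustrative:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/tty.h>
#include <sys/clist.h>

static int
queue_input(struct clist *q, char *buf, int len)
{
	int left;

	left = b_to_q(buf, len, q);
	if (left != 0)
		printf("input overflow, dropped %d characters\n", left);

	/* A quoted character is added one at a time with TTY_QUOTE set;
	 * putc() records the quote bit in the cblock's quote bitmap. */
	if (putc(0xff | TTY_QUOTE, q) == -1)
		return (ENOSPC);
	return (0);
}

unputc(), whose body continues below, undoes the most recently queued character and recovers its quote information the same way.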
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_clock.c b/sys/kern/subr_clock.c new file mode 100644 index 0000000..a79e331 --- /dev/null +++ b/sys/kern/subr_clock.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + * + * $FreeBSD$ + */ + +/* + * Helpers for time-of-day clocks. This is useful for architectures that need + * support multiple models of such clocks, and generally serves to make the + * code more machine-independent. + * If the clock in question can also be used as a time counter, the driver + * needs to initiate this. + * This code is not yet used by all architectures. + */ + +/* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/sysctl.h> +#include <sys/timetc.h> + +#include "clock_if.h" + +static __inline int leapyear(int year); +static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS); + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +static device_t clock_dev = NULL; +static long clock_res; + +int adjkerntz; /* local offset from GMT in seconds */ +int disable_rtc_set; /* disable resettodr() if != 0 */ +int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ + +/* + * These have traditionally been in machdep, but should probably be moved to + * kern. + */ +SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, + &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); + +SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, + CTLFLAG_RW, &disable_rtc_set, 0, ""); + +SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, + CTLFLAG_RW, &wall_cmos_clock, 0, ""); + +static int +sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, + req); + if (!error && req->newptr) + resettodr(); + return (error); +} + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static __inline int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + time_t secs; + int i, year, days; + + year = ct->year; + + /* Sanity checks. 
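The branch-based leapyear() above is claimed to be equivalent to the usual modulo expression while doing less work in the common case. A stand-alone sketch that checks the claim over the range this file cares about (the conversion code rejects years past 2037); purely illustrative:

#include <stdio.h>

static int
leapyear(int year)
{
	int rv = 0;

	if ((year & 3) == 0) {
		rv = 1;
		if ((year % 100) == 0) {
			rv = 0;
			if ((year % 400) == 0)
				rv = 1;
		}
	}
	return (rv);
}

int
main(void)
{
	int year, classic;

	for (year = 1970; year <= 2037; year++) {
		classic = ((year % 4) == 0 && (year % 100) != 0) ||
		    ((year % 400) == 0);
		if (classic != leapyear(year))
			printf("mismatch at %d\n", year);
	}
	printf("1900:%d 1996:%d 2000:%d\n",
	    leapyear(1900), leapyear(1996), leapyear(2000));	/* 0 1 1 */
	return (0);
}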
*/ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + ct->year > 2037) /* time_t overflow */ + return (EINVAL); + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + /* Another sanity check. */ + if (ct->dow != -1 && ct->dow != day_of_week(days)) + return (EINVAL); + + /* Add hours, minutes, seconds. */ + secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec; + + ts->tv_sec = secs; + ts->tv_nsec = ct->nsec; + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +} + +void +clock_register(device_t dev, long res) +{ + + if (clock_dev != NULL) { + if (clock_res > res) { + if (bootverbose) { + device_printf(dev, "not installed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(clock_dev)); + } + return; + } else { + if (bootverbose) { + device_printf(clock_dev, "removed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(dev)); + } + } + } + clock_dev = dev; + clock_res = res; + if (bootverbose) { + device_printf(dev, "registered as a time-of-day clock " + "(resolution %ldus)\n", res); + } +} + +/* + * inittodr and settodr derived from the i386 versions written + * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and + * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94 + */ + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + struct timespec diff, ref, ts; + int error; + + if (base) { + ref.tv_sec = base; + ref.tv_nsec = 0; + tc_setclock(&ref); + } + + if (clock_dev == NULL) { + printf("warning: no time-of-day clock registered, system time " + "will not be set accurately\n"); + return; + } + error = CLOCK_GETTIME(clock_dev, &ts); + if (error != 0 && error != EINVAL) { + printf("warning: clock_gettime failed (%d), the system time " + "will not be set accurately\n", error); + return; + } + if (error == EINVAL || ts.tv_sec < 0) { + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); + } + + ts.tv_sec += tz.tz_minuteswest * 60 + + (wall_cmos_clock ? 
adjkerntz : 0); + + if (timespeccmp(&ref, &ts, >)) { + diff = ref; + timespecsub(&ref, &ts); + } else { + diff = ts; + timespecsub(&diff, &ref); + } + if (ts.tv_sec >= 2) { + /* badly off, adjust it */ + tc_setclock(&ts); + } +} + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + struct timespec ts; + int error; + + if (disable_rtc_set || clock_dev == NULL) + return; + + getnanotime(&ts); + ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) { + printf("warning: clock_settime failed (%d), time-of-day clock " + "not adjusted to system time\n", error); + return; + } +} diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c new file mode 100644 index 0000000..dabdf9d --- /dev/null +++ b/sys/kern/subr_devstat.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/sysctl.h> + +#include <sys/devicestat.h> + +static int devstat_num_devs; +static long devstat_generation; +static int devstat_version = DEVSTAT_VERSION; +static int devstat_current_devnumber; + +static struct devstatlist device_statq; + +/* + * Take a malloced and zeroed devstat structure given to us, fill it in + * and add it to the queue of devices. + */ +void +devstat_add_entry(struct devstat *ds, const char *dev_name, + int unit_number, u_int32_t block_size, + devstat_support_flags flags, + devstat_type_flags device_type, + devstat_priority priority) +{ + struct devstatlist *devstat_head; + struct devstat *ds_tmp; + + if (ds == NULL) + return; + + if (devstat_num_devs == 0) + STAILQ_INIT(&device_statq); + + devstat_generation++; + devstat_num_devs++; + + devstat_head = &device_statq; + + /* + * Priority sort. Each driver passes in its priority when it adds + * its devstat entry. Drivers are sorted first by priority, and + * then by probe order. 
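(A brief aside before the devstat sorting description resumes.) The time-of-day helpers above expect a hardware RTC driver to register itself and to supply clock_gettime/clock_settime methods through clock_if; inittodr()/resettodr() then talk to whichever clock was registered. A hedged sketch of the driver side; the myrtc_* names, the method wiring, and the placeholder date are assumptions, not code from this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/clock.h>

#include "clock_if.h"

static int
myrtc_attach(device_t dev)
{
	/* ...program the hardware... */

	/* Resolution is reported in microseconds; clock_register() keeps
	 * whichever registered clock it judges to have the better
	 * resolution. */
	clock_register(dev, 1000000);
	return (0);
}

static int
myrtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* A real driver reads these from the RTC registers; fixed
	 * placeholder values are used here. */
	ct.year = 2002;
	ct.mon = 6;
	ct.day = 1;
	ct.hour = 12;
	ct.min = 0;
	ct.sec = 0;
	ct.dow = -1;		/* -1 skips the day-of-week cross-check */
	ct.nsec = 0;
	return (clock_ct_to_ts(&ct, ts));
}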
+ * + * For the first device, we just insert it, since the priority + * doesn't really matter yet. Subsequent devices are inserted into + * the list using the order outlined above. + */ + if (devstat_num_devs == 1) + STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); + else { + STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { + struct devstat *ds_next; + + ds_next = STAILQ_NEXT(ds_tmp, dev_links); + + /* + * If we find a break between higher and lower + * priority items, and if this item fits in the + * break, insert it. This also applies if the + * "lower priority item" is the end of the list. + */ + if ((priority <= ds_tmp->priority) + && ((ds_next == NULL) + || (priority > ds_next->priority))) { + STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, + dev_links); + break; + } else if (priority > ds_tmp->priority) { + /* + * If this is the case, we should be able + * to insert ourselves at the head of the + * list. If we can't, something is wrong. + */ + if (ds_tmp == STAILQ_FIRST(devstat_head)) { + STAILQ_INSERT_HEAD(devstat_head, + ds, dev_links); + break; + } else { + STAILQ_INSERT_TAIL(devstat_head, + ds, dev_links); + printf("devstat_add_entry: HELP! " + "sorting problem detected " + "for %s%d\n", dev_name, + unit_number); + break; + } + } + } + } + + ds->device_number = devstat_current_devnumber++; + ds->unit_number = unit_number; + strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); + ds->device_name[DEVSTAT_NAME_LEN - 1] = '\0'; + ds->block_size = block_size; + ds->flags = flags; + ds->device_type = device_type; + ds->priority = priority; + getmicrotime(&ds->dev_creation_time); +} + +/* + * Remove a devstat structure from the list of devices. + */ +void +devstat_remove_entry(struct devstat *ds) +{ + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + devstat_generation++; + devstat_num_devs--; + + devstat_head = &device_statq; + + /* Remove this entry from the devstat queue */ + STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); +} + +/* + * Record a transaction start. + */ +void +devstat_start_transaction(struct devstat *ds) +{ + /* sanity check */ + if (ds == NULL) + return; + + /* + * We only want to set the start time when we are going from idle + * to busy. The start time is really the start of the latest busy + * period. + */ + if (ds->busy_count == 0) + getmicrouptime(&ds->start_time); + ds->busy_count++; +} + +/* + * Record the ending of a transaction, and incrment the various counters. + */ +void +devstat_end_transaction(struct devstat *ds, u_int32_t bytes, + devstat_tag_type tag_type, devstat_trans_flags flags) +{ + struct timeval busy_time; + + /* sanity check */ + if (ds == NULL) + return; + + getmicrouptime(&ds->last_comp_time); + ds->busy_count--; + + /* + * There might be some transactions (DEVSTAT_NO_DATA) that don't + * transfer any data. + */ + if (flags == DEVSTAT_READ) { + ds->bytes_read += bytes; + ds->num_reads++; + } else if (flags == DEVSTAT_WRITE) { + ds->bytes_written += bytes; + ds->num_writes++; + } else if (flags == DEVSTAT_FREE) { + ds->bytes_freed += bytes; + ds->num_frees++; + } else + ds->num_other++; + + /* + * Keep a count of the various tag types sent. + */ + if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && + tag_type != DEVSTAT_TAG_NONE) + ds->tag_types[tag_type]++; + + /* + * We only update the busy time when we go idle. Otherwise, this + * calculation would require many more clock cycles. 
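Taken together, devstat_add_entry(), devstat_start_transaction() and devstat_end_transaction() above give a block driver the whole accounting life cycle. A hedged sketch of typical usage; the softc layout and the flag/type/priority constants are assumptions drawn from <sys/devicestat.h>, not from this file:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>

struct mydisk_softc {
	struct devstat	stats;
	/* ... */
};

static void
mydisk_attach_stats(struct mydisk_softc *sc, int unit)
{
	devstat_add_entry(&sc->stats, "mydisk", unit, DEV_BSIZE,
	    DEVSTAT_NO_ORDERED_TAGS,
	    DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
	    DEVSTAT_PRIORITY_DISK);
}

static void
mydisk_start_bio(struct mydisk_softc *sc, struct bio *bp)
{
	/* Marks the start of a busy period if the device was idle. */
	devstat_start_transaction(&sc->stats);
	/* ...hand bp to the hardware... */
}

static void
mydisk_bio_done(struct mydisk_softc *sc, struct bio *bp)
{
	/* Accounts bytes moved, the transaction type and the busy time. */
	devstat_end_transaction_bio(&sc->stats, bp);
	biodone(bp);
}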
+ */ + if (ds->busy_count == 0) { + /* Calculate how long we were busy */ + busy_time = ds->last_comp_time; + timevalsub(&busy_time, &ds->start_time); + + /* Add our busy time to the total busy time. */ + timevaladd(&ds->busy_time, &busy_time); + } else if (ds->busy_count < 0) + printf("devstat_end_transaction: HELP!! busy_count " + "for %s%d is < 0 (%d)!\n", ds->device_name, + ds->unit_number, ds->busy_count); +} + +void +devstat_end_transaction_bio(struct devstat *ds, struct bio *bp) +{ + devstat_trans_flags flg; + + if (bp->bio_cmd == BIO_DELETE) + flg = DEVSTAT_FREE; + else if (bp->bio_cmd == BIO_READ) + flg = DEVSTAT_READ; + else + flg = DEVSTAT_WRITE; + + devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, + DEVSTAT_TAG_SIMPLE, flg); +} + +/* + * This is the sysctl handler for the devstat package. The data pushed out + * on the kern.devstat.all sysctl variable consists of the current devstat + * generation number, and then an array of devstat structures, one for each + * device in the system. + * + * I'm really not too fond of this method of doing things, but there really + * aren't that many alternatives. We must have some method of making sure + * that the generation number the user gets corresponds with the data the + * user gets. If the user makes a separate sysctl call to get the + * generation, and then a sysctl call to get the device statistics, the + * device list could have changed in that brief period of time. By + * supplying the generation number along with the statistics output, we can + * guarantee that the generation number and the statistics match up. + */ +static int +sysctl_devstat(SYSCTL_HANDLER_ARGS) +{ + int error, i; + struct devstat *nds; + struct devstatlist *devstat_head; + + if (devstat_num_devs == 0) + return(EINVAL); + + error = 0; + devstat_head = &device_statq; + + /* + * First push out the generation number. + */ + error = SYSCTL_OUT(req, &devstat_generation, sizeof(long)); + + /* + * Now push out all the devices. + */ + for (i = 0, nds = STAILQ_FIRST(devstat_head); + (nds != NULL) && (i < devstat_num_devs) && (error == 0); + nds = STAILQ_NEXT(nds, dev_links), i++) + error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); + + return(error); +} + +/* + * Sysctl entries for devstat. The first one is a node that all the rest + * hang off of. + */ +SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics"); + +SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, + 0, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); +/* + * Export the number of devices in the system so that userland utilities + * can determine how much memory to allocate to hold all the devices. + */ +SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, + &devstat_num_devs, 0, "Number of devices in the devstat list"); +SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, + &devstat_generation, 0, "Devstat list generation"); +SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, + &devstat_version, 0, "Devstat list version number"); diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c new file mode 100644 index 0000000..1982e7f --- /dev/null +++ b/sys/kern/subr_disk.c @@ -0,0 +1,434 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. 
If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $FreeBSD$ + * + */ + +#include "opt_geom.h" +#ifndef GEOM + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <machine/md_var.h> +#include <sys/ctype.h> + +static MALLOC_DEFINE(M_DISK, "disk", "disk data"); + +static d_strategy_t diskstrategy; +static d_open_t diskopen; +static d_close_t diskclose; +static d_ioctl_t diskioctl; +static d_psize_t diskpsize; + +static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist); + +void disk_dev_synth(dev_t dev); + +void +disk_dev_synth(dev_t dev) +{ + struct disk *dp; + int u, s, p; + dev_t pdev; + + if (dksparebits(dev)) + return; + LIST_FOREACH(dp, &disklist, d_list) { + if (major(dev) != dp->d_devsw->d_maj) + continue; + u = dkunit(dev); + p = RAW_PART; + s = WHOLE_DISK_SLICE; + pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p)); + if (pdev->si_devsw == NULL) + return; /* Probably a unit we don't have */ + s = dkslice(dev); + p = dkpart(dev); + if (s == WHOLE_DISK_SLICE && p == RAW_PART) { + /* XXX: actually should not happen */ + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d", + dp->d_devsw->d_name, u); + dev_depends(pdev, dev); + return; + } + if (s == COMPATIBILITY_SLICE) { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d%c", + dp->d_devsw->d_name, u, 'a' + p); + dev_depends(pdev, dev); + return; + } + if (p != RAW_PART) { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1, + 'a' + p); + } else { + dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1); + make_dev_alias(dev, "%s%ds%dc", + dp->d_devsw->d_name, u, s - BASE_SLICE + 1); + } + dev_depends(pdev, dev); + return; + } +} + +static void +disk_clone(void *arg, char *name, int namelen, dev_t *dev) +{ + struct disk *dp; + char const *d; + char *e; + int j, u, s, p; + dev_t pdev; + + if (*dev != NODEV) + return; + + LIST_FOREACH(dp, &disklist, d_list) { + d = dp->d_devsw->d_name; + j = dev_stdclone(name, &e, d, &u); + if (j == 0) + continue; + if (u > DKMAXUNIT) + continue; + p = RAW_PART; + s = WHOLE_DISK_SLICE; + pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p)); + if (pdev->si_disk == NULL) + continue; + if (*e != '\0') { + j = dev_stdclone(e, &e, "s", &s); + if (j == 0) + s = COMPATIBILITY_SLICE; + else if (j == 1 || j == 2) + s += BASE_SLICE - 1; + if (!*e) + ; /* ad0s1 case */ + else if (e[1] != '\0') + return; /* can never be a disk name */ + else if (*e < 'a' || *e > 'h') + return; /* can never be a disk name */ + else + p = *e - 'a'; + } + if (s == WHOLE_DISK_SLICE && p == RAW_PART) { + return; + } else if (s >= BASE_SLICE && p != RAW_PART) { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c", + pdev->si_devsw->d_name, u, s - BASE_SLICE + 1, + p + 'a'); + } else if (s >= BASE_SLICE) { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d", + pdev->si_devsw->d_name, u, s - BASE_SLICE + 1); + make_dev_alias(*dev, "%s%ds%dc", + 
pdev->si_devsw->d_name, u, s - BASE_SLICE + 1); + } else { + *dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p), + UID_ROOT, GID_OPERATOR, 0640, "%s%d%c", + pdev->si_devsw->d_name, u, p + 'a'); + } + dev_depends(pdev, *dev); + return; + } +} + +static void +inherit_raw(dev_t pdev, dev_t dev) +{ + dev->si_disk = pdev->si_disk; + dev->si_drv1 = pdev->si_drv1; + dev->si_drv2 = pdev->si_drv2; + dev->si_iosize_max = pdev->si_iosize_max; + dev->si_bsize_phys = pdev->si_bsize_phys; + dev->si_bsize_best = pdev->si_bsize_best; +} + +dev_t +disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto) +{ + static int once; + dev_t dev; + + if (!once) { + EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000); + once++; + } + + bzero(dp, sizeof(*dp)); + + if (proto->d_open != diskopen) { + *proto = *cdevsw; + proto->d_open = diskopen; + proto->d_close = diskclose; + proto->d_ioctl = diskioctl; + proto->d_strategy = diskstrategy; + proto->d_psize = diskpsize; + } + + if (bootverbose) + printf("Creating DISK %s%d\n", cdevsw->d_name, unit); + dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART), + UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit); + + dev->si_disk = dp; + dp->d_dev = dev; + dp->d_dsflags = flags; + dp->d_devsw = cdevsw; + LIST_INSERT_HEAD(&disklist, dp, d_list); + + return (dev); +} + +static int +diskdumpconf(u_int onoff, dev_t dev, struct disk *dp) +{ + struct dumperinfo di; + struct disklabel *dl; + + if (!onoff) + return(set_dumper(NULL)); + dl = dsgetlabel(dev, dp->d_slice); + if (!dl) + return (ENXIO); + bzero(&di, sizeof di); + di.dumper = (dumper_t *)dp->d_devsw->d_dump; + di.priv = dp->d_dev; + di.blocksize = dl->d_secsize; + di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset + + dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE; + di.mediasize = + (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE; + return(set_dumper(&di)); +} + +void +disk_invalidate (struct disk *disk) +{ + if (disk->d_slice) + dsgone(&disk->d_slice); +} + +void +disk_destroy(dev_t dev) +{ + LIST_REMOVE(dev->si_disk, d_list); + bzero(dev->si_disk, sizeof(*dev->si_disk)); + dev->si_disk = NULL; + destroy_dev(dev); + return; +} + +struct disk * +disk_enumerate(struct disk *disk) +{ + if (!disk) + return (LIST_FIRST(&disklist)); + else + return (LIST_NEXT(disk, d_list)); +} + +static int +sysctl_disks(SYSCTL_HANDLER_ARGS) +{ + struct disk *disk; + int error, first; + + disk = NULL; + first = 1; + + while ((disk = disk_enumerate(disk))) { + if (!first) { + error = SYSCTL_OUT(req, " ", 1); + if (error) + return error; + } else { + first = 0; + } + error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name)); + if (error) + return error; + } + error = SYSCTL_OUT(req, "", 1); + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, NULL, + sysctl_disks, "A", "names of available disks"); + +/* + * The cdevsw functions + */ + +static int +diskopen(dev_t dev, int oflags, int devtype, struct thread *td) +{ + dev_t pdev; + struct disk *dp; + int error; + + error = 0; + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + + while (dp->d_flags & DISKFLAG_LOCK) { + dp->d_flags |= DISKFLAG_WANTED; + error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz); + if (error) + return (error); + } + dp->d_flags |= DISKFLAG_LOCK; + + if (!dsisopen(dp->d_slice)) { + if (!pdev->si_iosize_max) + pdev->si_iosize_max = dev->si_iosize_max; + 
error = dp->d_devsw->d_open(pdev, oflags, devtype, td); + } + + /* Inherit properties from the whole/raw dev_t */ + inherit_raw(pdev, dev); + + if (error) + goto out; + + error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, &dp->d_label); + + if (!dsisopen(dp->d_slice)) + dp->d_devsw->d_close(pdev, oflags, devtype, td); +out: + dp->d_flags &= ~DISKFLAG_LOCK; + if (dp->d_flags & DISKFLAG_WANTED) { + dp->d_flags &= ~DISKFLAG_WANTED; + wakeup(dp); + } + + return(error); +} + +static int +diskclose(dev_t dev, int fflag, int devtype, struct thread *td) +{ + struct disk *dp; + int error; + dev_t pdev; + + error = 0; + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + dsclose(dev, devtype, dp->d_slice); + if (!dsisopen(dp->d_slice)) + error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td); + return (error); +} + +static void +diskstrategy(struct bio *bp) +{ + dev_t pdev; + struct disk *dp; + + pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + bp->bio_resid = bp->bio_bcount; + if (dp != bp->bio_dev->si_disk) + inherit_raw(pdev, bp->bio_dev); + + if (!dp) { + biofinish(bp, NULL, ENXIO); + return; + } + + if (dscheck(bp, dp->d_slice) <= 0) { + biodone(bp); + return; + } + + if (bp->bio_bcount == 0) { + biodone(bp); + return; + } + + KASSERT(dp->d_devsw != NULL, ("NULL devsw")); + KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy")); + dp->d_devsw->d_strategy(bp); + return; + +} + +static int +diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + struct disk *dp; + int error; + u_int u; + dev_t pdev; + + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (ENXIO); + if (cmd == DIOCSKERNELDUMP) { + u = *(u_int *)data; + return (diskdumpconf(u, dev, dp)); + } + if (cmd == DIOCGFRONTSTUFF) { + *(off_t *)data = 8192; /* XXX: crude but enough) */ + return (0); + } + error = dsioctl(dev, cmd, data, fflag, &dp->d_slice); + if (error == ENOIOCTL) + error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td); + return (error); +} + +static int +diskpsize(dev_t dev) +{ + struct disk *dp; + dev_t pdev; + + pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + dp = pdev->si_disk; + if (!dp) + return (-1); + if (dp != dev->si_disk) { + dev->si_drv1 = pdev->si_drv1; + dev->si_drv2 = pdev->si_drv2; + /* XXX: don't set bp->b_dev->si_disk (?) */ + } + return (dssize(dev, &dp->d_slice)); +} + +SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD, + 0, sizeof(struct disklabel), "sizeof(struct disklabel)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD, + 0, sizeof(struct diskslices), "sizeof(struct diskslices)"); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD, + 0, sizeof(struct disk), "sizeof(struct disk)"); + +#endif diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..e149687 --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> +#include <machine/atomic.h> + +#ifdef notquite +/* + * Mutex to use when delaying niced I/O bound processes in bioqdisksort(). + */ +static struct mtx dksort_mtx; +static void +dksort_init(void) +{ + + mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF); +} +SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL) +#endif + +/* + * Seek sort for disks. + * + * The buf_queue keep two queues, sorted in ascending block order. The first + * queue holds those requests which are positioned after the current block + * (in the first request); the second, which starts at queue->switch_point, + * holds requests which came in after their block number was passed. Thus + * we implement a one way scan, retracting after reaching the end of the drive + * to the first request on the second queue, at which time it becomes the + * first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. 
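The bioqdisksort() comment above describes a one-way (elevator-style) scan: requests at or beyond the current position are served in ascending block order, then the scan retracts to the requests that arrived behind the head. A small user-level illustration of the resulting service order; it deliberately ignores the insert_point locking and arrival-time subtleties and is not the kernel insertion code:

#include <stdio.h>
#include <stdlib.h>

static int
cmp(const void *a, const void *b)
{
	return (*(const int *)a - *(const int *)b);
}

int
main(void)
{
	int head = 500;				/* current head position */
	int blocks[] = { 700, 100, 650, 300, 900 };
	int first[5], second[5];
	int i, nf = 0, ns = 0;

	/* Partition into the two queues the comment describes... */
	for (i = 0; i < 5; i++) {
		if (blocks[i] >= head)
			first[nf++] = blocks[i];
		else
			second[ns++] = blocks[i];
	}
	/* ...and keep each queue in ascending block order. */
	qsort(first, nf, sizeof(int), cmp);
	qsort(second, ns, sizeof(int), cmp);

	printf("service order:");
	for (i = 0; i < nf; i++)
		printf(" %d", first[i]);
	for (i = 0; i < ns; i++)
		printf(" %d", second[i]);
	printf("\n");		/* -> 650 700 900 100 300 */
	return (0);
}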
+ */ + +void +bioqdisksort(bioq, bp) + struct bio_queue_head *bioq; + struct bio *bp; +{ + struct bio *bq; + struct bio *bn; + struct bio *be; + +#ifdef notquite + struct thread *td = curthread; + + if (td && td->td_ksegrp->kg_nice > 0) { + TAILQ_FOREACH(bn, &bioq->queue, bio_queue) + if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp) + break; + if (bn != NULL) { + mtx_lock(&dksort_mtx); + msleep((caddr_t)&dksort_mtx, &dksort_mtx, + PPAUSE | PCATCH | PDROP, "ioslow", + td->td_ksegrp->kg_nice); + } + } +#endif + if (!atomic_cmpset_int(&bioq->busy, 0, 1)) + panic("Recursing in bioqdisksort()"); + be = TAILQ_LAST(&bioq->queue, bio_queue); + /* + * If the queue is empty or we are an + * ordered transaction, then it's easy. + */ + if ((bq = bioq_first(bioq)) == NULL) { + bioq_insert_tail(bioq, bp); + bioq->busy = 0; + return; + } else if (bioq->insert_point != NULL) { + + /* + * A certain portion of the list is + * "locked" to preserve ordering, so + * we can only insert after the insert + * point. + */ + bq = bioq->insert_point; + } else { + + /* + * If we lie before the last removed (currently active) + * request, and are not inserting ourselves into the + * "locked" portion of the list, then we must add ourselves + * to the second request list. + */ + if (bp->bio_pblkno < bioq->last_pblkno) { + + bq = bioq->switch_point; + /* + * If we are starting a new secondary list, + * then it's easy. + */ + if (bq == NULL) { + bioq->switch_point = bp; + bioq_insert_tail(bioq, bp); + bioq->busy = 0; + return; + } + /* + * If we lie ahead of the current switch point, + * insert us before the switch point and move + * the switch point. + */ + if (bp->bio_pblkno < bq->bio_pblkno) { + bioq->switch_point = bp; + TAILQ_INSERT_BEFORE(bq, bp, bio_queue); + bioq->busy = 0; + return; + } + } else { + if (bioq->switch_point != NULL) + be = TAILQ_PREV(bioq->switch_point, + bio_queue, bio_queue); + /* + * If we lie between last_pblkno and bq, + * insert before bq. + */ + if (bp->bio_pblkno < bq->bio_pblkno) { + TAILQ_INSERT_BEFORE(bq, bp, bio_queue); + bioq->busy = 0; + return; + } + } + } + + /* + * Request is at/after our current position in the list. + * Optimize for sequential I/O by seeing if we go at the tail. + */ + if (bp->bio_pblkno > be->bio_pblkno) { + TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue); + bioq->busy = 0; + return; + } + + /* Otherwise, insertion sort */ + while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) { + + /* + * We want to go after the current request if it is the end + * of the first request list, or if the next request is a + * larger cylinder than our request. + */ + if (bn == bioq->switch_point + || bp->bio_pblkno < bn->bio_pblkno) + break; + bq = bn; + } + TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue); + bioq->busy = 0; +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be filled in before calling us. + * Returns NULL on success and an error string on failure. 
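readdisklabel(), which follows, expects the caller to pre-fill the label fields its strategy path needs and treats any non-NULL return as a diagnostic string. A hedged sketch of a caller; the dummy geometry and the error value are illustrative assumptions:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/disklabel.h>

static int
mydisk_read_label(dev_t dev, struct disklabel *lp)
{
	char *msg;

	bzero(lp, sizeof(*lp));
	lp->d_secsize = DEV_BSIZE;
	lp->d_secpercyl = 1;		/* dummy geometry, per the comment above */
	lp->d_partitions[RAW_PART].p_size = 0x7fffffff;	/* dummy bounds */

	msg = readdisklabel(dev, lp);
	if (msg != NULL) {
		printf("%s: %s\n", dev->si_name, msg);
		return (EINVAL);
	}
	return (0);
}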
+ */ +char * +readdisklabel(dev, lp) + dev_t dev; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp)) + msg = "I/O error"; + else if (bp->b_resid != 0) + msg = "disk too small for a label"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register int i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... + */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, lp) + dev_t dev; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int error = 0; + + if (lp->d_partitions[RAW_PART].p_offset != 0) + return (EXDEV); /* not quite right */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, RAW_PART); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) 
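The label routines above all lean on dkcksum() to decide whether a label is valid. The traditional BSD checksum is the XOR of the 16-bit words of the label up to and including its partition table, so a label whose d_checksum field was stored correctly XORs to zero. A self-contained sketch of that idea over a generic buffer, not the real struct disklabel:

#include <stdint.h>
#include <stdio.h>

/* XOR of the 16-bit words in a region (len is rounded down to words). */
static uint16_t
xor16(const void *buf, size_t len)
{
        const uint16_t *p = buf;
        uint16_t sum = 0;

        for (len /= 2; len > 0; len--)
                sum ^= *p++;
        return (sum);
}

int
main(void)
{
        /* Toy "label": the last word plays the role of d_checksum. */
        uint16_t label[8] = { 0x8257, 0x1234, 0xabcd, 7, 63, 16, 255, 0 };

        label[7] = xor16(label, sizeof(label));         /* store checksum */
        printf("verify: %#x\n", xor16(label, sizeof(label)));   /* 0 */
        return (0);
}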
+ */ + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + error = bufwait(bp); + if (error) + goto done; + if (bp->b_resid != 0) { + error = ENOSPC; + goto done; + } + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~B_DONE; + bp->b_iocmd = BIO_WRITE; +#ifdef __alpha__ + alpha_fix_srm_checksum(bp); +#endif + DEV_STRATEGY(bp, 1); + error = bufwait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_iocmd = BIO_WRITE; + DEV_STRATEGY(bp, 1); + error = bufwait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf. + * The message should be completed with at least a newline. There is no + * trailing space. + */ +void +diskerr(bp, what, blkdone, lp) + struct bio *bp; + char *what; + int blkdone; + register struct disklabel *lp; +{ + int part = dkpart(bp->bio_dev); + char partname[2]; + char *sname; + daddr_t sn; + + *partname = '\0'; + sname = bp->bio_dev->si_name; + printf("%s%s: %s %sing fsbn ", sname, partname, what, + bp->bio_cmd == BIO_READ ? "read" : "writ"); + sn = bp->bio_blkno; + if (bp->bio_bcount <= DEV_BSIZE) + printf("%jd", (intmax_t)sn); + else { + if (blkdone >= 0) { + sn += blkdone; + printf("%jd of ", (intmax_t)sn); + } + printf("%ld-%ld", (long)bp->bio_blkno, + (long)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE)); + } + if (lp && (blkdone >= 0 || bp->bio_bcount <= lp->d_secsize)) { + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + printf(" (%s bn %jd; cn %jd", sname, (intmax_t)sn, + (intmax_t)(sn / lp->d_secpercyl)); + sn %= lp->d_secpercyl; + printf(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors), + (long)(sn % lp->d_nsectors)); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..40d5b2d --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,544 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef PC98 +#define PC98_ATCOMPAT +#define dsinit atcompat_dsinit +#endif +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_EXTENDEDX 15 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +/* + * This is what we have embedded in every boot1 for supporting the bogus + * "Dangerously Dedicated" mode. However, the old table is broken because + * it has an illegal geometry in it - it specifies 256 heads (heads = end + * head + 1) which causes nasty stuff when that wraps to zero in bios code. + * eg: divide by zero etc. This caused the dead-thinkpad problem, numerous + * SCSI bios crashes, EFI to crash, etc. + * + * We still have to recognize the old table though, even though we stopped + * inflicting it apon the world. 
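check_part() below compares each partition entry's C/H/S start and end against its LBA start and size. The conversion it relies on is the usual one, sketched here with sectors 1-based and heads and cylinders 0-based; the names are illustrative only:

#include <stdio.h>

static unsigned long
chs_to_lba(unsigned cyl, unsigned head, unsigned sect,
    unsigned nsectors, unsigned ntracks)
{
        /* secpercyl = nsectors * ntracks, as in check_part(). */
        return (((unsigned long)cyl * ntracks + head) * nsectors + sect - 1);
}

int
main(void)
{
        /* 1023/254/63 with 63 sectors, 255 tracks: the usual pure-LBA cap. */
        printf("%lu\n", chs_to_lba(1023, 254, 63, 63, 255)); /* 16450559 */
        return (0);
}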
+ */ +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; +static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, }, +}; + +static int check_part(char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset); +static void mbr_extended(dev_t dev, struct disklabel *lp, + struct diskslices *ssp, u_long ext_offset, + u_long ext_size, u_long base_ext_offset, + int nsectors, int ntracks, u_long mbr_offset, + int level); +static int mbr_setslice(char *sname, struct disklabel *lp, + struct diskslice *sp, struct dos_partition *dp, + u_long br_offset); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. Always allow + * 1023/255/63, because this is the official way to represent + * pure-LBA for the starting position. + */ + if ((ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (secpercyl != 0 + && (ssector1 - ssector) % (1024 * secpercyl) == 0))) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* + * Allow certain bogus C/H/S values for esector, as above. However, + * heads == 255 isn't really legal and causes some BIOS crashes. The + * correct value to indicate a pure-LBA end is 1023/heads-1/sectors - + * usually 1023/254/63. "heads" is base 0, "sectors" is base 1. + */ + if ((esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (secpercyl != 0 + && (esector1 - esector) % (1024 * secpercyl) == 0))) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, + (u_long)dp->dp_size, error ? 
"" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dev, lp, sspp) + dev_t dev; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + struct dos_partition dpcopy[NDOSPART]; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp) != 0) { + diskerr(&bp->b_io, "reading primary partition table: error", + 0, (struct disklabel *)NULL); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. */ + cp = bp->b_data; + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + if (bootverbose) + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + + /* Make a copy of the partition table to avoid alignment problems. */ + memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy)); + + dp0 = &dpcopy[0]; + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0 || + bcmp(dp0, historical_bogus_partition_table_fixed, + sizeof historical_bogus_partition_table_fixed) == 0) { + if (bootverbose) + printf( + "%s: invalid primary partition table: Dangerously Dedicated (ignored)\n", + sname); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. + * Check against d_secperunit if the latter is reliable. 
+ */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * We are passed a pointer to a suitably initialized minimal + * slices "struct" with no dangling pointers in it. Replace it + * by a maximal one. This usually oversizes the "struct", but + * enlarging it while searching for logical drives would be + * inconvenient. + */ + free(*sspp, M_DEVBUF); + ssp = dsmakeslicestruct(MAX_SLICES, lp); + *sspp = ssp; + + /* Initialize normal slices. */ + sp = &ssp->dss_slices[BASE_SLICE]; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sname = dsname(dev, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + (void)mbr_setslice(sname, lp, sp, dp, mbr_offset); + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED || + sp->ds_type == DOSPTYP_EXTENDEDX) + mbr_extended(bp->b_dev, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset, 1); + + /* + * mbr_extended() abuses ssp->dss_nslices for the number of slices + * that would be found if there were no limit on the number of slices + * in *ssp. Cut it back now. + */ + if (ssp->dss_nslices > MAX_SLICES) + ssp->dss_nslices = MAX_SLICES; + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +mbr_extended(dev, lp, ssp, ext_offset, ext_size, base_ext_offset, nsectors, + ntracks, mbr_offset, level) + dev_t dev; + struct disklabel *lp; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; + int level; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition dpcopy[NDOSPART]; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + if (level >= 16) { + printf( + "%s: excessive recursion in search for slices; aborting search\n", + devtoname(dev)); + return; + } + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_iocmd = BIO_READ; + DEV_STRATEGY(bp, 1); + if (bufwait(bp) != 0) { + diskerr(&bp->b_io, "reading extended partition table: error", + 0, (struct disklabel *)NULL); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_data; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (bootverbose) + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + /* Make a copy of the partition table to avoid alignment problems. */ + memcpy(&dpcopy[0], cp + DOSPARTOFF, sizeof(dpcopy)); + + slice = ssp->dss_nslices; + for (dospart = 0, dp = &dpcopy[0]; dospart < NDOSPART; + dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED || + dp->dp_typ == DOSPTYP_EXTENDEDX) { + static char buf[32]; + + sname = dsname(dev, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + snprintf(buf, sizeof(buf), "%s", sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dev, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp = &ssp->dss_slices[slice]; + if (mbr_setslice(sname, lp, sp, dp, ext_offset) != 0) + continue; + slice++; + } + } + ssp->dss_nslices = slice; + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + mbr_extended(dev, lp, ssp, ext_offsets[dospart], + ext_sizes[dospart], base_ext_offset, + nsectors, ntracks, mbr_offset, ++level); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} + +static int +mbr_setslice(sname, lp, sp, dp, br_offset) + char *sname; + struct disklabel *lp; + struct diskslice *sp; + struct dos_partition *dp; + u_long br_offset; +{ + u_long offset; + u_long size; + + offset = br_offset + dp->dp_start; + if (offset > lp->d_secperunit || offset < br_offset) { + printf( + "%s: slice starts beyond end of the disk: rejecting it\n", + sname); + return (1); + } + size = lp->d_secperunit - offset; + if (size >= dp->dp_size) + size = dp->dp_size; + else + printf( +"%s: slice extends beyond end of disk: truncating from %lu to %lu sectors\n", + sname, (u_long)dp->dp_size, size); + sp->ds_offset = offset; + sp->ds_size = size; + sp->ds_type = dp->dp_typ; +#ifdef PC98_ATCOMPAT + /* Fake FreeBSD(98). */ + if (sp->ds_type == DOSPTYP_386BSD) + sp->ds_type = 0x94; +#endif +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart | DSTYPE_INDOSPART; +#endif + return (0); +} + +#ifdef __alpha__ +void +alpha_fix_srm_checksum(bp) + struct buf *bp; +{ + u_int64_t *p; + u_int64_t sum; + int i; + + p = (u_int64_t *) bp->b_data; + sum = 0; + for (i = 0; i < 63; i++) + sum += p[i]; + p[63] = sum; +} +#endif diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..ec6099e --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,997 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/stdint.h> +#include <sys/syslog.h> +#include <sys/vnode.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static struct disklabel *clone_label(struct disklabel *lp); +static void dsiodone(struct bio *bp); +static char *fixlabel(char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag); +static void free_ds_label(struct diskslices *ssp, int slice); +static void partition_info(char *sname, int part, struct partition *pp); +static void slice_info(char *sname, struct diskslice *sp); +static void set_ds_label(struct diskslices *ssp, int slice, + struct disklabel *lp); +static void set_ds_labeldevs(dev_t dev, struct diskslices *ssp); +static void set_ds_wlabel(struct diskslices *ssp, int slice, + int wlabel); + +/* + * Duplicate a label for the whole disk, and initialize defaults in the + * copy for fields that are not already initialized. The caller only + * needs to initialize d_secsize and d_secperunit, and zero the fields + * that are to be defaulted. 
+ */ +static struct disklabel * +clone_label(lp) + struct disklabel *lp; +{ + struct disklabel *lp1; + + lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK); + *lp1 = *lp; + lp = NULL; + if (lp1->d_typename[0] == '\0') + strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename)); + if (lp1->d_packname[0] == '\0') + strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname)); + if (lp1->d_nsectors == 0) + lp1->d_nsectors = 32; + if (lp1->d_ntracks == 0) + lp1->d_ntracks = 64; + lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks; + lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl; + if (lp1->d_rpm == 0) + lp1->d_rpm = 3600; + if (lp1->d_interleave == 0) + lp1->d_interleave = 1; + if (lp1->d_npartitions < RAW_PART + 1) + lp1->d_npartitions = MAXPARTITIONS; + if (lp1->d_bbsize == 0) + lp1->d_bbsize = BBSIZE; + lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit; + lp1->d_magic = DISKMAGIC; + lp1->d_magic2 = DISKMAGIC; + lp1->d_checksum = dkcksum(lp1); + return (lp1); +} + +dev_t +dkmodpart(dev_t dev, int part) +{ + return (makedev(major(dev), (minor(dev) & ~7) | part)); +} + +dev_t +dkmodslice(dev_t dev, int slice) +{ + return (makedev(major(dev), (minor(dev) & ~0x1f0000) | (slice << 16))); +} + +u_int +dkunit(dev_t dev) +{ + return (((minor(dev) >> 16) & 0x1e0) | ((minor(dev) >> 3) & 0x1f)); +} + +/* + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + * + * XXX TODO: + * o Split buffers that are too big for the device. + * o Check for overflow. + * o Finish cleaning this up. + */ +int +dscheck(bp, ssp) + struct bio *bp; + struct diskslices *ssp; +{ + daddr_t blkno; + daddr_t endsecno; + daddr_t labelsect; + struct disklabel *lp; + char *msg; + long nsec; + struct partition *pp; + daddr_t secno; + daddr_t slicerel_secno; + struct diskslice *sp; + + blkno = bp->bio_blkno; + if (blkno < 0) { + printf("dscheck(%s): negative bio_blkno %ld\n", + devtoname(bp->bio_dev), (long)blkno); + bp->bio_error = EINVAL; + goto bad; + } + sp = &ssp->dss_slices[dkslice(bp->bio_dev)]; + lp = sp->ds_label; + if (ssp->dss_secmult == 1) { + if (bp->bio_bcount % (u_long)DEV_BSIZE) + goto bad_bcount; + secno = blkno; + nsec = bp->bio_bcount >> DEV_BSHIFT; + } else if (ssp->dss_secshift != -1) { + if (bp->bio_bcount & (ssp->dss_secsize - 1)) + goto bad_bcount; + if (blkno & (ssp->dss_secmult - 1)) + goto bad_blkno; + secno = blkno >> ssp->dss_secshift; + nsec = bp->bio_bcount >> (DEV_BSHIFT + ssp->dss_secshift); + } else { + if (bp->bio_bcount % ssp->dss_secsize) + goto bad_bcount; + if (blkno % ssp->dss_secmult) + goto bad_blkno; + secno = blkno / ssp->dss_secmult; + nsec = bp->bio_bcount / ssp->dss_secsize; + } + if (lp == NULL) { + labelsect = -LABELSECTOR - 1; + endsecno = sp->ds_size; + slicerel_secno = secno; + } else { + labelsect = lp->d_partitions[LABEL_PART].p_offset; +if (labelsect != 0) Debugger("labelsect != 0 in dscheck()"); + pp = &lp->d_partitions[dkpart(bp->bio_dev)]; + endsecno = pp->p_size; + slicerel_secno = pp->p_offset + secno; + } + + /* overwriting disk label ? */ + /* XXX should also protect bootstrap in first 8K */ + if (slicerel_secno <= LABELSECTOR + labelsect && +#if LABELSECTOR != 0 + slicerel_secno + nsec > LABELSECTOR + labelsect && +#endif + (bp->bio_cmd == BIO_WRITE) && sp->ds_wlabel == 0) { + bp->bio_error = EROFS; + goto bad; + } + +#if defined(DOSBBSECTOR) && defined(notyet) + /* overwriting master boot record? 
*/ + if (slicerel_secno <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && + sp->ds_wlabel == 0) { + bp->bio_error = EROFS; + goto bad; + } +#endif + + /* beyond partition? */ + if ((uintmax_t)secno + nsec > endsecno) { + /* if exactly at end of disk, return an EOF */ + if (secno == endsecno) { + bp->bio_resid = bp->bio_bcount; + return (0); + } + /* or truncate if part of it fits */ + if (secno > endsecno) { + bp->bio_error = EINVAL; + goto bad; + } + bp->bio_bcount = (endsecno - secno) * ssp->dss_secsize; + } + + bp->bio_pblkno = sp->ds_offset + slicerel_secno; + + /* + * Snoop on label accesses if the slice offset is nonzero. Fudge + * offsets in the label to keep the in-core label coherent with + * the on-disk one. + */ + if (slicerel_secno <= LABELSECTOR + labelsect +#if LABELSECTOR != 0 + && slicerel_secno + nsec > LABELSECTOR + labelsect +#endif + && sp->ds_offset != 0) { + struct iodone_chain *ic; + + ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK); + ic->ic_prev_flags = bp->bio_flags; + ic->ic_prev_iodone = bp->bio_done; + ic->ic_prev_iodone_chain = bp->bio_done_chain; + ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - + slicerel_secno) * ssp->dss_secsize; + ic->ic_args[1].ia_ptr = sp; + bp->bio_done = dsiodone; + bp->bio_done_chain = ic; + if (!(bp->bio_cmd == BIO_READ)) { + /* + * XXX even disklabel(8) writes directly so we need + * to adjust writes. Perhaps we should drop support + * for DIOCWLABEL (always write protect labels) and + * require the use of DIOCWDINFO. + * + * XXX probably need to copy the data to avoid even + * temporarily corrupting the in-core copy. + */ + /* XXX need name here. */ + msg = fixlabel((char *)NULL, sp, + (struct disklabel *) + (bp->bio_data + ic->ic_args[0].ia_long), + TRUE); + if (msg != NULL) { + printf("dscheck(%s): %s\n", + devtoname(bp->bio_dev), msg); + bp->bio_error = EROFS; + goto bad; + } + } + } + return (1); + +bad_bcount: + printf( + "dscheck(%s): bio_bcount %ld is not on a sector boundary (ssize %d)\n", + devtoname(bp->bio_dev), bp->bio_bcount, ssp->dss_secsize); + bp->bio_error = EINVAL; + goto bad; + +bad_blkno: + printf( + "dscheck(%s): bio_blkno %ld is not on a sector boundary (ssize %d)\n", + devtoname(bp->bio_dev), (long)blkno, ssp->dss_secsize); + bp->bio_error = EINVAL; + goto bad; + +bad: + bp->bio_resid = bp->bio_bcount; + bp->bio_flags |= BIO_ERROR; + return (-1); +} + +void +dsclose(dev, mode, ssp) + dev_t dev; + int mode; + struct diskslices *ssp; +{ + u_char mask; + struct diskslice *sp; + + sp = &ssp->dss_slices[dkslice(dev)]; + mask = 1 << dkpart(dev); + sp->ds_openmask &= ~mask; +} + +void +dsgone(sspp) + struct diskslices **sspp; +{ + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + free_ds_label(ssp, slice); + } + free(ssp, M_DEVBUF); + *sspp = NULL; +} + +/* + * For the "write" commands (DIOCSDINFO and DIOCWDINFO), this + * is subject to the same restriction as dsopen(). 
+ */ +int +dsioctl(dev, cmd, data, flags, sspp) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct diskslices **sspp; +{ + int error; + struct disklabel *lp; + int old_wlabel; + u_char openmask; + int part; + int slice; + struct diskslice *sp; + struct diskslices *ssp; + struct partition *pp; + + slice = dkslice(dev); + ssp = *sspp; + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + switch (cmd) { + + case DIOCGDVIRGIN: + lp = (struct disklabel *)data; + if (ssp->dss_slices[WHOLE_DISK_SLICE].ds_label) { + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + } else { + bzero(lp, sizeof(struct disklabel)); + } + + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + pp = &lp->d_partitions[RAW_PART]; + pp->p_offset = 0; + pp->p_size = sp->ds_size; + + lp->d_npartitions = MAXPARTITIONS; + if (lp->d_interleave == 0) + lp->d_interleave = 1; + if (lp->d_rpm == 0) + lp->d_rpm = 3600; + if (lp->d_nsectors == 0) + lp->d_nsectors = 32; + if (lp->d_ntracks == 0) + lp->d_ntracks = 64; + + lp->d_bbsize = BBSIZE; + lp->d_sbsize = 0; + lp->d_secpercyl = lp->d_nsectors * lp->d_ntracks; + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (0); + + case DIOCGDINFO: + if (lp == NULL) + return (EINVAL); + *(struct disklabel *)data = *lp; + return (0); + + case DIOCGSECTORSIZE: + if (lp == NULL) + return (EINVAL); + *(u_int *)data = lp->d_secsize; + return (0); + + case DIOCGMEDIASIZE: + if (lp == NULL) + return (EINVAL); + *(off_t *)data = (off_t)lp->d_partitions[dkpart(dev)].p_size * + lp->d_secsize; + return (0); + + case DIOCGSLICEINFO: + bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] - + (char *)ssp); + return (0); + + case DIOCSDINFO: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + if (sp->ds_label == NULL) + bzero(lp, sizeof *lp); + else + bcopy(sp->ds_label, lp, sizeof *lp); + if (sp->ds_label == NULL) + openmask = 0; + else { + openmask = sp->ds_openmask; + if (slice == COMPATIBILITY_SLICE) + openmask |= ssp->dss_slices[ + ssp->dss_first_bsd_slice].ds_openmask; + else if (slice == ssp->dss_first_bsd_slice) + openmask |= ssp->dss_slices[ + COMPATIBILITY_SLICE].ds_openmask; + } + error = setdisklabel(lp, (struct disklabel *)data, + (u_long)openmask); + /* XXX why doesn't setdisklabel() check this? */ + if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0) + error = EXDEV; + if (error == 0) { + if (lp->d_secperunit > sp->ds_size) + error = ENOSPC; + for (part = 0; part < lp->d_npartitions; part++) + if (lp->d_partitions[part].p_size > sp->ds_size) + error = ENOSPC; + } + if (error != 0) { + free(lp, M_DEVBUF); + return (error); + } + free_ds_label(ssp, slice); + set_ds_label(ssp, slice, lp); + set_ds_labeldevs(dev, ssp); + return (0); + + case DIOCSYNCSLICEINFO: + if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART) + return (EINVAL); + if (!*(int *)data) + for (slice = 0; slice < ssp->dss_nslices; slice++) { + openmask = ssp->dss_slices[slice].ds_openmask; + if (openmask + && (slice != WHOLE_DISK_SLICE + || openmask & ~(1 << RAW_PART))) + return (EBUSY); + } + + /* + * Temporarily forget the current slices struct and read + * the current one. + * XXX should wait for current accesses on this disk to + * complete, then lock out future accesses and opens. 
+ */ + *sspp = NULL; + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + error = dsopen(dev, S_IFCHR, ssp->dss_oflags, sspp, lp); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (error); + } + + /* + * Reopen everything. This is a no-op except in the "force" + * case and when the raw bdev and cdev are both open. Abort + * if anything fails. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + for (openmask = ssp->dss_slices[slice].ds_openmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dkmodslice(dkmodpart(dev, part), + slice), + S_IFCHR, ssp->dss_oflags, sspp, + lp); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (EBUSY); + } + } + } + + free(lp, M_DEVBUF); + dsgone(&ssp); + return (0); + + case DIOCWDINFO: + error = dsioctl(dev, DIOCSDINFO, data, flags, &ssp); + if (error != 0) + return (error); + /* + * XXX this used to hack on dk_openpart to fake opening + * partition 0 in case that is used instead of dkpart(dev). + */ + old_wlabel = sp->ds_wlabel; + set_ds_wlabel(ssp, slice, TRUE); + error = writedisklabel(dev, sp->ds_label); + /* XXX should invalidate in-core label if write failed. */ + set_ds_wlabel(ssp, slice, old_wlabel); + return (error); + + case DIOCWLABEL: +#ifndef __alpha__ + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); +#endif + if (!(flags & FWRITE)) + return (EBADF); + set_ds_wlabel(ssp, slice, *(int *)data != 0); + return (0); + + default: + return (ENOIOCTL); + } +} + +static void +dsiodone(bp) + struct bio *bp; +{ + struct iodone_chain *ic; + char *msg; + + ic = bp->bio_done_chain; + bp->bio_done = ic->ic_prev_iodone; + bp->bio_done_chain = ic->ic_prev_iodone_chain; + if (!(bp->bio_cmd == BIO_READ) + || (!(bp->bio_flags & BIO_ERROR) && bp->bio_error == 0)) { + msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr, + (struct disklabel *) + (bp->bio_data + ic->ic_args[0].ia_long), + FALSE); + if (msg != NULL) + printf("%s\n", msg); + } + free(ic, M_DEVBUF); + biodone(bp); +} + +int +dsisopen(ssp) + struct diskslices *ssp; +{ + int slice; + + if (ssp == NULL) + return (0); + for (slice = 0; slice < ssp->dss_nslices; slice++) + if (ssp->dss_slices[slice].ds_openmask) + return (1); + return (0); +} + +/* + * Allocate a slices "struct" and initialize it to contain only an empty + * compatibility slice (pointing to itself), a whole disk slice (covering + * the disk as described by the label), and (nslices - BASE_SLICES) empty + * slices beginning at BASE_SLICE. 
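dsmakeslicestruct() below sizes its allocation with offsetof() plus a variable number of trailing slice slots, a common header-plus-trailing-array idiom. A small standalone sketch of the pattern with stand-in types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot {
        unsigned long   offset;
        unsigned long   size;
};

struct table {
        int             nslots;
        struct slot     slots[1];       /* really nslots entries */
};

static struct table *
table_alloc(int nslots)
{
        struct table *tp;

        tp = malloc(offsetof(struct table, slots) +
            nslots * sizeof(struct slot));
        if (tp == NULL)
                return (NULL);
        tp->nslots = nslots;
        memset(tp->slots, 0, nslots * sizeof(struct slot));
        return (tp);
}

int
main(void)
{
        struct table *tp = table_alloc(8);

        if (tp != NULL) {
                printf("%d slots\n", tp->nslots);
                free(tp);
        }
        return (0);
}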
+ */ +struct diskslices * +dsmakeslicestruct(nslices, lp) + int nslices; + struct disklabel *lp; +{ + struct diskslice *sp; + struct diskslices *ssp; + + ssp = malloc(offsetof(struct diskslices, dss_slices) + + nslices * sizeof *sp, M_DEVBUF, M_WAITOK); + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = nslices; + ssp->dss_oflags = 0; + ssp->dss_secmult = lp->d_secsize / DEV_BSIZE; + if (ssp->dss_secmult & (ssp->dss_secmult - 1)) + ssp->dss_secshift = -1; + else + ssp->dss_secshift = ffs(ssp->dss_secmult) - 1; + ssp->dss_secsize = lp->d_secsize; + sp = &ssp->dss_slices[0]; + bzero(sp, nslices * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + return (ssp); +} + +char * +dsname(dev, unit, slice, part, partname) + dev_t dev; + int unit; + int slice; + int part; + char *partname; +{ + static char name[32]; + const char *dname; + + dname = devsw(dev)->d_name; + if (strlen(dname) > 16) + dname = "nametoolong"; + snprintf(name, sizeof(name), "%s%d", dname, unit); + partname[0] = '\0'; + if (slice != WHOLE_DISK_SLICE || part != RAW_PART) { + partname[0] = 'a' + part; + partname[1] = '\0'; + if (slice != COMPATIBILITY_SLICE) + snprintf(name + strlen(name), + sizeof(name) - strlen(name), "s%d", slice - 1); + } + return (name); +} + +/* + * This should only be called when the unit is inactive and the strategy + * routine should not allow it to become active unless we call it. Our + * strategy routine must be special to allow activity. + */ +int +dsopen(dev, mode, flags, sspp, lp) + dev_t dev; + int mode; + u_int flags; + struct diskslices **sspp; + struct disklabel *lp; +{ + dev_t dev1; + int error; + struct disklabel *lp1; + char *msg; + u_char mask; + int part; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + int unit; + + dev->si_bsize_phys = lp->d_secsize; + + unit = dkunit(dev); + if (lp->d_secsize % DEV_BSIZE) { + printf("%s: invalid sector size %lu\n", devtoname(dev), + (u_long)lp->d_secsize); + return (EINVAL); + } + + /* + * XXX reinitialize the slice table unless there is an open device + * on the unit. This should only be done if the media has changed. + */ + ssp = *sspp; + if (!dsisopen(ssp)) { + if (ssp != NULL) + dsgone(sspp); + /* + * Allocate a minimal slices "struct". This will become + * the final slices "struct" if we don't want real slices + * or if we can't find any real slices. + */ + *sspp = dsmakeslicestruct(BASE_SLICE, lp); + + if (!(flags & DSO_ONESLICE)) { + TRACE(("dsinit\n")); + error = dsinit(dev, lp, sspp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + ssp = *sspp; + ssp->dss_oflags = flags; + + /* + * If there are no real slices, then make the compatiblity + * slice cover the whole disk. + */ + if (ssp->dss_nslices == BASE_SLICE) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = lp->d_secperunit; + + /* Point the compatibility slice at the BSD slice, if any. */ + for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) { + ssp->dss_first_bsd_slice = slice; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset + = sp->ds_offset; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = sp->ds_size; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_type + = sp->ds_type; + break; + } + } + + ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp); + ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE; + } + + /* Initialize secondary info for all slices. 
*/ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_label != NULL +#ifdef __alpha__ + && slice != WHOLE_DISK_SLICE +#endif + ) + continue; + dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice); +#if 0 + sname = dsname(dev, unit, slice, RAW_PART, partname); +#else + *partname='\0'; + sname = dev1->si_name; +#endif + /* + * XXX this should probably only be done for the need_init + * case, but there may be a problem with DIOCSYNCSLICEINFO. + */ + set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */ + lp1 = clone_label(lp); + TRACE(("readdisklabel\n")); + if (flags & DSO_NOLABELS) + msg = NULL; + else { + msg = readdisklabel(dev1, lp1); + + /* + * readdisklabel() returns NULL for success, and an + * error string for failure. + * + * If there isn't a label on the disk, and if the + * DSO_COMPATLABEL is set, we want to use the + * faked-up label provided by the caller. + * + * So we set msg to NULL to indicate that there is + * no failure (since we have a faked-up label), + * free lp1, and then clone it again from lp. + * (In case readdisklabel() modified lp1.) + */ + if (msg != NULL && (flags & DSO_COMPATLABEL)) { + msg = NULL; + free(lp1, M_DEVBUF); + lp1 = clone_label(lp); + } + } + if (msg == NULL) + msg = fixlabel(sname, sp, lp1, FALSE); + if (msg == NULL && lp1->d_secsize != ssp->dss_secsize) + msg = "inconsistent sector size"; + if (msg != NULL) { + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) + log(LOG_WARNING, "%s: cannot find label (%s)\n", + sname, msg); + free(lp1, M_DEVBUF); + continue; + } + if (lp1->d_flags & D_BADSECT) { + log(LOG_ERR, "%s: bad sector table not supported\n", + sname); + free(lp1, M_DEVBUF); + continue; + } + set_ds_label(ssp, slice, lp1); + set_ds_labeldevs(dev1, ssp); + set_ds_wlabel(ssp, slice, FALSE); + } + + slice = dkslice(dev); + if (slice >= ssp->dss_nslices) + return (ENXIO); + sp = &ssp->dss_slices[slice]; + part = dkpart(dev); + if (part != RAW_PART + && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions)) + return (EINVAL); /* XXX needs translation */ + mask = 1 << part; + sp->ds_openmask |= mask; + return (0); +} + +int +dssize(dev, sspp) + dev_t dev; + struct diskslices **sspp; +{ + struct disklabel *lp; + int part; + int slice; + struct diskslices *ssp; + + slice = dkslice(dev); + part = dkpart(dev); + ssp = *sspp; + if (ssp == NULL || slice >= ssp->dss_nslices + || !(ssp->dss_slices[slice].ds_openmask & (1 << part))) { + if (devsw(dev)->d_open(dev, FREAD, S_IFCHR, + (struct thread *)NULL) != 0) + return (-1); + devsw(dev)->d_close(dev, FREAD, S_IFCHR, (struct thread *)NULL); + ssp = *sspp; + } + lp = ssp->dss_slices[slice].ds_label; + if (lp == NULL) + return (-1); + return ((int)lp->d_partitions[part].p_size); +} + +static void +free_ds_label(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; + free(lp, M_DEVBUF); + set_ds_label(ssp, slice, (struct disklabel *)NULL); +} + + +static char * +fixlabel(sname, sp, lp, writeflag) + char *sname; + struct diskslice *sp; + struct disklabel *lp; + int writeflag; +{ + u_long end; + u_long offset; + int part; + struct partition *pp; + u_long start; + bool_t warned; + + /* These errors "can't happen" so don't bother reporting details. 
*/ + if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC) + return ("fixlabel: invalid magic"); + if (dkcksum(lp) != 0) + return ("fixlabel: invalid checksum"); + + pp = &lp->d_partitions[RAW_PART]; + if (writeflag) { + start = 0; + offset = sp->ds_offset; + } else { + start = sp->ds_offset; + offset = -sp->ds_offset; + } + if (pp->p_offset != start) { + if (sname != NULL) { + printf( +"%s: rejecting BSD label: raw partition offset != slice offset\n", + sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + return ("fixlabel: raw partition offset != slice offset"); + } + if (pp->p_size != sp->ds_size) { + if (sname != NULL) { + printf("%s: raw partition size != slice size\n", sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + if (pp->p_size > sp->ds_size) { + if (sname == NULL) + return ("fixlabel: raw partition size > slice size"); + printf("%s: truncating raw partition\n", sname); + pp->p_size = sp->ds_size; + } + } + end = start + sp->ds_size; + if (start > end) + return ("fixlabel: slice wraps"); + if (lp->d_secpercyl <= 0) + return ("fixlabel: d_secpercyl <= 0"); + pp -= RAW_PART; + warned = FALSE; + for (part = 0; part < lp->d_npartitions; part++, pp++) { + if (pp->p_offset != 0 || pp->p_size != 0) { + if (pp->p_offset < start + || pp->p_offset + pp->p_size > end + || pp->p_offset + pp->p_size < pp->p_offset) { + if (sname != NULL) { + printf( +"%s: rejecting partition in BSD label: it isn't entirely within the slice\n", + sname); + if (!warned) { + slice_info(sname, sp); + warned = TRUE; + } + partition_info(sname, part, pp); + } + /* XXX else silently discard junk. */ + bzero(pp, sizeof *pp); + } else + pp->p_offset += offset; + } + } + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (NULL); +} + +static void +partition_info(sname, part, pp) + char *sname; + int part; + struct partition *pp; +{ + printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part, + (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1), + (u_long)pp->p_size); +} + +static void +slice_info(sname, sp) + char *sname; + struct diskslice *sp; +{ + printf("%s: start %lu, end %lu, size %lu\n", sname, + sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size); +} + +static void +set_ds_label(ssp, slice, lp) + struct diskslices *ssp; + int slice; + struct disklabel *lp; +{ + ssp->dss_slices[slice].ds_label = lp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp; +} + +static void +set_ds_labeldevs(dev, ssp) + dev_t dev; + struct diskslices *ssp; +{ +} + + +static void +set_ds_wlabel(ssp, slice, wlabel) + struct diskslices *ssp; + int slice; + int wlabel; +{ + ssp->dss_slices[slice].ds_wlabel = wlabel; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel; +} diff --git a/sys/kern/subr_eventhandler.c b/sys/kern/subr_eventhandler.c new file mode 100644 index 0000000..45b4302 --- /dev/null +++ b/sys/kern/subr_eventhandler.c @@ -0,0 +1,173 @@ +/*- + * Copyright (c) 1999 Michael Smith <msmith@freebsd.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/eventhandler.h> + +static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records"); + +/* List of 'slow' lists */ +static TAILQ_HEAD(, eventhandler_list) eventhandler_lists; +static int eventhandler_lists_initted = 0; +static struct mtx eventhandler_mutex; + +struct eventhandler_entry_generic +{ + struct eventhandler_entry ee; + void (* func)(void); +}; + +/* + * Initialize the eventhandler mutex and list. + */ +static void +eventhandler_init(void *dummy __unused) +{ + TAILQ_INIT(&eventhandler_lists); + mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF | MTX_RECURSE); + eventhandler_lists_initted = 1; +} +SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init, + NULL) + +/* + * Insertion is O(n) due to the priority scan, but optimises to O(1) + * if all priorities are identical. + */ +eventhandler_tag +eventhandler_register(struct eventhandler_list *list, char *name, + void *func, void *arg, int priority) +{ + struct eventhandler_entry_generic *eg; + struct eventhandler_entry *ep; + + KASSERT(eventhandler_lists_initted, ("eventhandler registered too early")); + + /* lock the eventhandler lists */ + mtx_lock(&eventhandler_mutex); + + /* Do we need to find/create the (slow) list? */ + if (list == NULL) { + /* look for a matching, existing list */ + list = eventhandler_find_list(name); + + /* Do we need to create the list? 
*/ + if (list == NULL) { + if ((list = malloc(sizeof(struct eventhandler_list) + strlen(name) + 1, + M_EVENTHANDLER, M_NOWAIT)) == NULL) { + mtx_unlock(&eventhandler_mutex); + return(NULL); + } + list->el_flags = 0; + bzero(&list->el_lock, sizeof(list->el_lock)); + list->el_name = (char *)list + sizeof(struct eventhandler_list); + strcpy(list->el_name, name); + TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link); + } + } + if (!(list->el_flags & EHE_INITTED)) { + TAILQ_INIT(&list->el_entries); + sx_init(&list->el_lock, name); + list->el_flags = EHE_INITTED; + } + mtx_unlock(&eventhandler_mutex); + + /* allocate an entry for this handler, populate it */ + if ((eg = malloc(sizeof(struct eventhandler_entry_generic), + M_EVENTHANDLER, M_NOWAIT)) == NULL) { + return(NULL); + } + eg->func = func; + eg->ee.ee_arg = arg; + eg->ee.ee_priority = priority; + + /* sort it into the list */ + EHE_LOCK(list); + for (ep = TAILQ_FIRST(&list->el_entries); + ep != NULL; + ep = TAILQ_NEXT(ep, ee_link)) { + if (eg->ee.ee_priority < ep->ee_priority) { + TAILQ_INSERT_BEFORE(ep, &eg->ee, ee_link); + break; + } + } + if (ep == NULL) + TAILQ_INSERT_TAIL(&list->el_entries, &eg->ee, ee_link); + EHE_UNLOCK(list); + return(&eg->ee); +} + +void +eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag) +{ + struct eventhandler_entry *ep = tag; + + /* XXX insert diagnostic check here? */ + EHE_LOCK(list); + if (ep != NULL) { + /* remove just this entry */ + TAILQ_REMOVE(&list->el_entries, ep, ee_link); + free(ep, M_EVENTHANDLER); + } else { + /* remove entire list */ + while (!TAILQ_EMPTY(&list->el_entries)) { + ep = TAILQ_FIRST(&list->el_entries); + TAILQ_REMOVE(&list->el_entries, ep, ee_link); + free(ep, M_EVENTHANDLER); + } + } + EHE_UNLOCK(list); +} + +struct eventhandler_list * +eventhandler_find_list(char *name) +{ + struct eventhandler_list *list; + + if (!eventhandler_lists_initted) + return(NULL); + + /* scan looking for the requested list */ + mtx_lock(&eventhandler_mutex); + for (list = TAILQ_FIRST(&eventhandler_lists); + list != NULL; + list = TAILQ_NEXT(list, el_link)) { + if (!strcmp(name, list->el_name)) + break; + } + mtx_unlock(&eventhandler_mutex); + + return(list); +} + diff --git a/sys/kern/subr_hints.c b/sys/kern/subr_hints.c new file mode 100644 index 0000000..c68d607 --- /dev/null +++ b/sys/kern/subr_hints.c @@ -0,0 +1,366 @@ +/*- + * Copyright (c) 2000,2001 Peter Wemm <peter@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/sx.h> +#include <sys/systm.h> +#include <sys/bus.h> + +/* + * Access functions for device resources. + */ + +static int checkmethod = 1; +static int use_kenv; +static char *hintp; + +/* + * Evil wildcarding resource string lookup. + * This walks the supplied env string table and returns a match. + * The start point can be remembered for incremental searches. + */ +static int +res_find(int *line, int *startln, + const char *name, int *unit, const char *resname, const char *value, + const char **ret_name, int *ret_namelen, int *ret_unit, + const char **ret_resname, int *ret_resnamelen, const char **ret_value) +{ + int n = 0, hit, i = 0; + char r_name[32]; + int r_unit; + char r_resname[32]; + char r_value[128]; + const char *s, *cp; + char *p; + + if (checkmethod) { + switch (hintmode) { + case 0: /* loader hints in environment only */ + break; + case 1: /* static hints only */ + hintp = static_hints; + checkmethod = 0; + break; + case 2: /* fallback mode */ + if (dynamic_kenv) { + sx_slock(&kenv_lock); + cp = kenvp[0]; + for (i = 0; cp != NULL; cp = kenvp[++i]) { + if (!strncmp(cp, "hint.", 5)) { + use_kenv = 1; + checkmethod = 0; + break; + } + } + sx_sunlock(&kenv_lock); + } else { + cp = kern_envp; + while (cp) { + if (strncmp(cp, "hint.", 5) == 0) { + cp = NULL; + hintp = kern_envp; + break; + } + while (*cp != '\0') + cp++; + cp++; + if (*cp == '\0') { + cp = NULL; + hintp = static_hints; + break; + } + } + } + break; + default: + break; + } + if (hintp == NULL) { + if (dynamic_kenv) { + use_kenv = 1; + checkmethod = 0; + } else + hintp = kern_envp; + } + } + + if (use_kenv) { + sx_slock(&kenv_lock); + i = 0; + cp = kenvp[0]; + if (cp == NULL) { + sx_sunlock(&kenv_lock); + return (ENOENT); + } + } else + cp = hintp; + while (cp) { + hit = 1; + (*line)++; + if (strncmp(cp, "hint.", 5) != 0) + hit = 0; + else + n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%128s", + r_name, &r_unit, r_resname, r_value); + if (hit && n != 4) { + printf("CONFIG: invalid hint '%s'\n", cp); + /* XXX: abuse bogus index() declaration */ + p = index(cp, 'h'); + *p = 'H'; + hit = 0; + } + if (hit && startln && *startln >= 0 && *line < *startln) + hit = 0; + if (hit && name && strcmp(name, r_name) != 0) + hit = 0; + if (hit && unit && *unit != r_unit) + hit = 0; + if (hit && resname && strcmp(resname, r_resname) != 0) + hit = 0; + if (hit && value && strcmp(value, r_value) != 0) + hit = 0; + if (hit) + break; + if (use_kenv) { + cp = kenvp[++i]; + if (cp == NULL) + break; + } else { + while (*cp != '\0') + cp++; + cp++; + if (*cp == '\0') { + cp = NULL; + break; + } + } + } + if (use_kenv) + sx_sunlock(&kenv_lock); + if (cp == NULL) + return ENOENT; + + s = cp; + /* This is a bit of a hack, but at least is reentrant */ + /* Note that it returns some !unterminated! strings. 
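Each hint has the textual form hint.<driver>.<unit>.<resource>=<value>, for example hint.fdc.0.irq=6, and res_find() above picks one apart with a single sscanf() call. A minimal userspace sketch of the same parse; the buffers are one byte larger than the scan field widths to leave room for the terminating NUL:

#include <stdio.h>

int
main(void)
{
        const char *line = "hint.fdc.0.irq=6";  /* illustrative hint */
        char name[33], resname[33], value[129];
        int unit;

        if (sscanf(line, "hint.%32[^.].%d.%32[^=]=%128s",
            name, &unit, resname, value) == 4)
                printf("%s unit %d: %s = %s\n", name, unit, resname, value);
        return (0);
}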
*/ + s = index(s, '.') + 1; /* start of device */ + if (ret_name) + *ret_name = s; + s = index(s, '.') + 1; /* start of unit */ + if (ret_namelen) + *ret_namelen = s - *ret_name - 1; /* device length */ + if (ret_unit) + *ret_unit = r_unit; + s = index(s, '.') + 1; /* start of resname */ + if (ret_resname) + *ret_resname = s; + s = index(s, '=') + 1; /* start of value */ + if (ret_resnamelen) + *ret_resnamelen = s - *ret_resname - 1; /* value len */ + if (ret_value) + *ret_value = s; + if (startln) /* line number for anchor */ + *startln = *line + 1; + return 0; +} + +/* + * Search all the data sources for matches to our query. We look for + * dynamic hints first as overrides for static or fallback hints. + */ +static int +resource_find(int *line, int *startln, + const char *name, int *unit, const char *resname, const char *value, + const char **ret_name, int *ret_namelen, int *ret_unit, + const char **ret_resname, int *ret_resnamelen, const char **ret_value) +{ + int i; + int un; + + *line = 0; + + /* Search for exact unit matches first */ + i = res_find(line, startln, name, unit, resname, value, + ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, + ret_value); + if (i == 0) + return 0; + if (unit == NULL) + return ENOENT; + /* If we are still here, search for wildcard matches */ + un = -1; + i = res_find(line, startln, name, &un, resname, value, + ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, + ret_value); + if (i == 0) + return 0; + return ENOENT; +} + +int +resource_int_value(const char *name, int unit, const char *resname, int *result) +{ + int error; + const char *str; + char *op; + unsigned long val; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + if (*str == '\0') + return EFTYPE; + val = strtoul(str, &op, 0); + if (*op != '\0') + return EFTYPE; + *result = val; + return 0; +} + +int +resource_long_value(const char *name, int unit, const char *resname, + long *result) +{ + int error; + const char *str; + char *op; + unsigned long val; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + if (*str == '\0') + return EFTYPE; + val = strtoul(str, &op, 0); + if (*op != '\0') + return EFTYPE; + *result = val; + return 0; +} + +int +resource_string_value(const char *name, int unit, const char *resname, + const char **result) +{ + int error; + const char *str; + int line; + + line = 0; + error = resource_find(&line, NULL, name, &unit, resname, NULL, + NULL, NULL, NULL, NULL, NULL, &str); + if (error) + return error; + *result = str; + return 0; +} + +/* + * This is a bit nasty, but allows us to not modify the env strings. + */ +static const char * +resource_string_copy(const char *s, int len) +{ + static char stringbuf[256]; + static int offset = 0; + const char *ret; + + if (len == 0) + len = strlen(s); + if (len > 255) + return NULL; + if ((offset + len + 1) > 255) + offset = 0; + bcopy(s, &stringbuf[offset], len); + stringbuf[offset + len] = '\0'; + ret = &stringbuf[offset]; + offset += len + 1; + return ret; +} + +/* + * err = resource_find_at(&anchor, &name, &unit, resname, value) + * Iteratively fetch a list of devices wired "at" something + * res and value are restrictions. eg: "at", "scbus0". + * For practical purposes, res = required, value = optional. + * *name and *unit are set. + * set *anchor to zero before starting. 
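To make these accessors concrete, here is a small hypothetical sketch of a driver consulting the hint routines defined in this file; the device name "acme" and the hint strings are illustrative only.

/* Hypothetical consumer sketch; "acme" and the hints are made up. */
static void
acme_probe_hints(void)
{
	int anchor, irq, unit;

	/* Corresponds to a hint such as hint.acme.0.irq="5". */
	if (resource_int_value("acme", 0, "irq", &irq) == 0)
		printf("acme0: hinted irq %d\n", irq);

	/* Walk every "acme" unit wired "at" isa0. */
	anchor = 0;
	while (resource_find_dev(&anchor, "acme", &unit, "at", "isa0") == 0)
		printf("acme%d is wired at isa0\n", unit);
}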
+ */ +int +resource_find_match(int *anchor, const char **name, int *unit, + const char *resname, const char *value) +{ + const char *found_name; + int found_namelen; + int found_unit; + int ret; + int newln; + + newln = *anchor; + ret = resource_find(anchor, &newln, NULL, NULL, resname, value, + &found_name, &found_namelen, &found_unit, NULL, NULL, NULL); + if (ret == 0) { + *name = resource_string_copy(found_name, found_namelen); + *unit = found_unit; + } + *anchor = newln; + return ret; +} + + +/* + * err = resource_find_dev(&anchor, name, &unit, res, value); + * Iterate through a list of devices, returning their unit numbers. + * res and value are optional restrictions. eg: "at", "scbus0". + * *unit is set to the value. + * set *anchor to zero before starting. + */ +int +resource_find_dev(int *anchor, const char *name, int *unit, + const char *resname, const char *value) +{ + int found_unit; + int newln; + int ret; + + newln = *anchor; + ret = resource_find(anchor, &newln, name, NULL, resname, value, + NULL, NULL, &found_unit, NULL, NULL, NULL); + if (ret == 0) { + *unit = found_unit; + } + *anchor = newln; + return ret; +} diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c new file mode 100644 index 0000000..b5bfa1f --- /dev/null +++ b/sys/kern/subr_kobj.c @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/errno.h> +#ifndef TEST +#include <sys/systm.h> +#endif +#include <sys/kobj.h> + +#ifdef TEST +#include "usertest.h" +#endif + +static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures"); + +#ifdef KOBJ_STATS + +#include <sys/sysctl.h> + +u_int kobj_lookup_hits; +u_int kobj_lookup_misses; + +SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD, + &kobj_lookup_hits, 0, "") +SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD, + &kobj_lookup_misses, 0, "") + +#endif + +static int kobj_next_id = 1; + +static int +kobj_error_method(void) +{ + return ENXIO; +} + +static void +kobj_register_method(struct kobjop_desc *desc) +{ + if (desc->id == 0) + desc->id = kobj_next_id++; +} + +static void +kobj_unregister_method(struct kobjop_desc *desc) +{ +} + +static void +kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) +{ + kobj_method_t *m; + int i; + + /* + * Don't do anything if we are already compiled. + */ + if (cls->ops) + return; + + /* + * First register any methods which need it. + */ + for (i = 0, m = cls->methods; m->desc; i++, m++) + kobj_register_method(m->desc); + + /* + * Then initialise the ops table. + */ + bzero(ops, sizeof(struct kobj_ops)); + ops->cls = cls; + cls->ops = ops; +} + +void +kobj_class_compile(kobj_class_t cls) +{ + kobj_ops_t ops; + + /* + * Allocate space for the compiled ops table. + */ + ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); + if (!ops) + panic("kobj_compile_methods: out of memory"); + kobj_class_compile_common(cls, ops); +} + +void +kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops) +{ + /* + * Increment refs to make sure that the ops table is not freed. + */ + cls->refs++; + kobj_class_compile_common(cls, ops); +} + +void +kobj_lookup_method(kobj_method_t *methods, + kobj_method_t *ce, + kobjop_desc_t desc) +{ + ce->desc = desc; + for (; methods && methods->desc; methods++) { + if (methods->desc == desc) { + ce->func = methods->func; + return; + } + } + if (desc->deflt) + ce->func = desc->deflt; + else + ce->func = kobj_error_method; + return; +} + +void +kobj_class_free(kobj_class_t cls) +{ + int i; + kobj_method_t *m; + + /* + * Unregister any methods which are no longer used. + */ + for (i = 0, m = cls->methods; m->desc; i++, m++) + kobj_unregister_method(m->desc); + + /* + * Free memory and clean up. + */ + free(cls->ops, M_KOBJ); + cls->ops = 0; +} + +kobj_t +kobj_create(kobj_class_t cls, + struct malloc_type *mtype, + int mflags) +{ + kobj_t obj; + + /* + * Allocate and initialise the new object. + */ + obj = malloc(cls->size, mtype, mflags | M_ZERO); + if (!obj) + return 0; + kobj_init(obj, cls); + + return obj; +} + +void +kobj_init(kobj_t obj, kobj_class_t cls) +{ + /* + * Consider compiling the class' method table. + */ + if (!cls->ops) + kobj_class_compile(cls); + + obj->ops = cls->ops; + cls->refs++; +} + +void +kobj_delete(kobj_t obj, struct malloc_type *mtype) +{ + kobj_class_t cls = obj->ops->cls; + + /* + * Consider freeing the compiled method table for the class + * after its last instance is deleted. As an optimisation, we + * should defer this for a short while to avoid thrashing. 
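A short hypothetical sketch of the object life cycle implemented by this file; the "widget" class is illustrative, and its declaration (normally produced by the DEFINE_CLASS() macro in <sys/kobj.h>) is assumed rather than shown.

/* Hypothetical sketch; widget_class is assumed to be declared elsewhere. */
static void
widget_example(void)
{
	kobj_t w;

	/* kobj_create() zeroes the object and compiles the class on first use. */
	w = kobj_create(&widget_class, M_TEMP, M_NOWAIT);
	if (w == NULL)
		return;
	/* ... dispatch methods through the interface's generated macros ... */
	kobj_delete(w, M_TEMP);	/* drops the class reference taken by kobj_init() */
}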
+ */ + cls->refs--; + if (!cls->refs) + kobj_class_free(cls); + + obj->ops = 0; + if (mtype) + free(obj, mtype); +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 0000000..2c01568 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Error log buffer for kernel printf's. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/msgbuf.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/filedesc.h> +#include <sys/sysctl.h> + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_poll_t logpoll; + +static void logtimeout(void *arg); + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = { + /* open */ logopen, + /* close */ logclose, + /* read */ logread, + /* write */ nowrite, + /* ioctl */ logioctl, + /* poll */ logpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "log", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, +}; + +static struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + struct sigio *sc_sigio; /* information for async I/O */ + struct callout sc_callout; /* callout to wakeup syslog */ +} logsoftc; + +int log_open; /* also used in log() */ + +/* Times per second to check for a pending syslog wakeup. 
*/ +static int log_wakeups_per_second = 5; +SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW, + &log_wakeups_per_second, 0, ""); + +/*ARGSUSED*/ +static int +logopen(dev_t dev, int flags, int mode, struct thread *td) +{ + if (log_open) + return (EBUSY); + log_open = 1; + callout_init(&logsoftc.sc_callout, 0); + fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ + callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, + logtimeout, NULL); + return (0); +} + +/*ARGSUSED*/ +static int +logclose(dev_t dev, int flag, int mode, struct thread *td) +{ + + log_open = 0; + callout_stop(&logsoftc.sc_callout); + logsoftc.sc_state = 0; + funsetown(&logsoftc.sc_sigio); + return (0); +} + +/*ARGSUSED*/ +static int +logread(dev_t dev, struct uio *uio, int flag) +{ + struct msgbuf *mbp = msgbufp; + long l; + int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = mbp->msg_size - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr, + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +static int +logpoll(dev_t dev, int events, struct thread *td) +{ + int s; + int revents = 0; + + s = splhigh(); + + if (events & (POLLIN | POLLRDNORM)) { + if (msgbufp->msg_bufr != msgbufp->msg_bufx) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &logsoftc.sc_selp); + } + splx(s); + return (revents); +} + +static void +logtimeout(void *arg) +{ + + if (!log_open) + return; + if (msgbuftrigger == 0) { + callout_reset(&logsoftc.sc_callout, + hz / log_wakeups_per_second, logtimeout, NULL); + return; + } + msgbuftrigger = 0; + selwakeup(&logsoftc.sc_selp); + if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) + pgsigio(&logsoftc.sc_sigio, SIGIO, 0); + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } + callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second, + logtimeout, NULL); +} + +/*ARGSUSED*/ +static int +logioctl(dev_t dev, u_long com, caddr_t data, int flag, struct thread *td) +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += msgbufp->msg_size; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case FIOSETOWN: + return (fsetown(*(int *)data, &logsoftc.sc_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(logsoftc.sc_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. 
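The character device implemented here is what syslogd(8) reads; a minimal, hypothetical userland consumer of the read/poll interface above might look like the following (error handling abbreviated).

/* Hypothetical userland sketch of a /dev/klog reader (cf. syslogd). */
#include <poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd;
	char buf[1024];
	ssize_t n;

	if ((pfd.fd = open("/dev/klog", O_RDONLY | O_NONBLOCK)) == -1)
		return (1);
	pfd.events = POLLIN;
	for (;;) {
		/* logpoll() reports POLLIN once the message buffer is non-empty. */
		if (poll(&pfd, 1, -1) <= 0)
			continue;
		if ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
	}
}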
*/ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead */ + case TIOCGPGRP: + *(int *)data = -fgetown(logsoftc.sc_sigio); + break; + + default: + return (ENOTTY); + } + return (0); +} + +static void +log_drvinit(void *unused) +{ + + make_dev(&log_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "klog"); +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c new file mode 100644 index 0000000..74e1f56 --- /dev/null +++ b/sys/kern/subr_mbuf.c @@ -0,0 +1,1111 @@ +/*- + * Copyright (c) 2001 + * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +/* + * Maximum number of PCPU containers. If you know what you're doing you could + * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your + * system during compilation, and thus prevent kernel structure bloat. + * + * SMP and non-SMP kernels clearly have a different number of possible CPUs, + * but because we cannot assume a dense array of CPUs, we always allocate + * and traverse PCPU containers up to NCPU amount and merely check for + * CPU availability. + */ +#ifdef MBALLOC_NCPU +#define NCPU MBALLOC_NCPU +#else +#define NCPU MAXCPU +#endif + +/*- + * The mbuf allocator is heavily based on Alfred Perlstein's + * (alfred@FreeBSD.org) "memcache" allocator which is itself based + * on concepts from several per-CPU memory allocators. 
The difference + * between this allocator and memcache is that, among other things: + * + * (i) We don't free back to the map from the free() routine - we leave the + * option of implementing lazy freeing (from a kproc) in the future. + * + * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the + * maximum number of allocatable objects of a given type. Further, + * we handle blocking on a cv in the case that the map is starved and + * we have to rely solely on cached (circulating) objects. + * + * The mbuf allocator keeps all objects that it allocates in mb_buckets. + * The buckets keep a page worth of objects (an object can be an mbuf or an + * mbuf cluster) and facilitate moving larger sets of contiguous objects + * from the per-CPU lists to the main list for the given object. The buckets + * also have an added advantage in that after several moves from a per-CPU + * list to the main list and back to the per-CPU list, contiguous objects + * are kept together, thus trying to put the TLB cache to good use. + * + * The buckets are kept on singly-linked lists called "containers." A container + * is protected by a mutex lock in order to ensure consistency. The mutex lock + * itself is allocated separately and attached to the container at boot time, + * thus allowing for certain containers to share the same mutex lock. Per-CPU + * containers for mbufs and mbuf clusters all share the same per-CPU + * lock whereas the "general system" containers (i.e., the "main lists") for + * these objects share one global lock. + */ +struct mb_bucket { + SLIST_ENTRY(mb_bucket) mb_blist; + int mb_owner; + int mb_numfree; + void *mb_free[0]; +}; + +struct mb_container { + SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; + struct mtx *mc_lock; + int mc_numowner; + u_int mc_starved; + long *mc_types; + u_long *mc_objcount; + u_long *mc_numpgs; +}; + +struct mb_gen_list { + struct mb_container mb_cont; + struct cv mgl_mstarved; +}; + +struct mb_pcpu_list { + struct mb_container mb_cont; +}; + +/* + * Boot-time configurable object counts that will determine the maximum + * number of permitted objects in the mbuf and mcluster cases. In the + * ext counter (nmbcnt) case, it's just an indicator serving to scale + * kmem_map size properly - in other words, we may be allowed to allocate + * more than nmbcnt counters, whereas we will never be allowed to allocate + * more than nmbufs mbufs or nmbclusters mclusters. + * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be + * allocatable by the sfbuf allocator (found in uipc_syscalls.c) + */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (1024 + maxusers * 64) +#endif +#ifndef NMBUFS +#define NMBUFS (nmbclusters * 2) +#endif +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif +#ifndef NMBCNTS +#define NMBCNTS (nmbclusters + nsfbufs) +#endif +int nmbufs; +int nmbclusters; +int nmbcnt; +int nsfbufs; + +/* + * Perform sanity checks of tunables declared above. + */ +static void +tunable_mbinit(void *dummy) +{ + + /* + * This has to be done before VM init. 
+ */ + nmbclusters = NMBCLUSTERS; + TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); + nmbufs = NMBUFS; + TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); + nmbcnt = NMBCNTS; + TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); + /* Sanity checks */ + if (nmbufs < nmbclusters * 2) + nmbufs = nmbclusters * 2; + if (nmbcnt < nmbclusters + nsfbufs) + nmbcnt = nmbclusters + nsfbufs; +} +SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); + +/* + * The freelist structures and mutex locks. The number statically declared + * here depends on the number of CPUs. + * + * We set up in such a way that all the objects (mbufs, clusters) + * share the same mutex lock. It has been established that we do not benefit + * from different locks for different objects, so we use the same lock, + * regardless of object type. + */ +struct mb_lstmngr { + struct mb_gen_list *ml_genlist; + struct mb_pcpu_list *ml_cntlst[NCPU]; + struct mb_bucket **ml_btable; + vm_map_t ml_map; + vm_offset_t ml_mapbase; + vm_offset_t ml_maptop; + int ml_mapfull; + u_int ml_objsize; + u_int *ml_wmhigh; +}; +static struct mb_lstmngr mb_list_mbuf, mb_list_clust; +static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; + +/* + * Local macros for internal allocator structure manipulations. + */ +#ifdef SMP +#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] +#else +#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] +#endif + +#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist + +#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) + +#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) + +#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ + (mb_lst)->ml_cntlst[(num)] + +#define MB_BUCKET_INDX(mb_obj, mb_lst) \ + (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE) + +#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ +{ \ + struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ + \ + (mb_bckt)->mb_numfree--; \ + (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ + (*((mb_lst)->mb_cont.mc_objcount))--; \ + if ((mb_bckt)->mb_numfree == 0) { \ + SLIST_REMOVE_HEAD(_mchd, mb_blist); \ + SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ + (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ + } \ +} + +#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ + (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ + (mb_bckt)->mb_numfree++; \ + (*((mb_lst)->mb_cont.mc_objcount))++; + +#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ + if ((mb_type) != MT_NOTMBUF) \ + (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) + +#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ + if ((mb_type) != MT_NOTMBUF) \ + (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) + +/* + * Ownership of buckets/containers is represented by integers. The PCPU + * lists range from 0 to NCPU-1. We need a free numerical id for the general + * list (we use NCPU). We also need a non-conflicting free bit to indicate + * that the bucket is free and removed from a container, while not losing + * the bucket's originating container id. We use the highest bit + * for the free marker. + */ +#define MB_GENLIST_OWNER (NCPU) +#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) + +/* Statistics structures for allocator (per-CPU and general). */ +static struct mbpstat mb_statpcpu[NCPU + 1]; +struct mbstat mbstat; + +/* Sleep time for wait code (in ticks). 
*/ +static int mbuf_wait = 64; + +static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */ +static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */ + +/* + * Objects exported by sysctl(8). + */ +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, + "Maximum number of mbuf clusters available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, + "Maximum number of mbufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0, + "Number used to scale kmem_map to ensure sufficient space for counters"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, + "Sleep time of mbuf subsystem wait allocations during exhaustion"); +SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0, + "Upper limit of number of mbufs allowed on each PCPU list"); +SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0, + "Upper limit of number of mbuf clusters allowed on each PCPU list"); +SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, + "Mbuf general information and statistics"); +SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, + sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); + +/* + * Prototypes of local allocator routines. + */ +static void *mb_alloc_wait(struct mb_lstmngr *, short); +static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, + struct mb_pcpu_list *); +static void mb_reclaim(void); +static void mbuf_init(void *); + +/* + * Initial allocation numbers. Each parameter represents the number of buckets + * of each object that will be placed initially in each PCPU container for + * said object. + */ +#define NMB_MBUF_INIT 4 +#define NMB_CLUST_INIT 16 + +/* + * Initialize the mbuf subsystem. + * + * We sub-divide the kmem_map into several submaps; this way, we don't have + * to worry about artificially limiting the number of mbuf or mbuf cluster + * allocations, due to fear of one type of allocation "stealing" address + * space initially reserved for another. + * + * Set up both the general containers and all the PCPU containers. Populate + * the PCPU containers with initial numbers. + */ +MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) +void +mbuf_init(void *dummy) +{ + struct mb_pcpu_list *pcpu_cnt; + vm_size_t mb_map_size; + int i, j; + + /* + * Set up all the submaps, for each type of object that we deal + * with in this allocator. 
+ */ + mb_map_size = (vm_size_t)(nmbufs * MSIZE); + mb_map_size = rounddown(mb_map_size, PAGE_SIZE); + mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * + sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); + if (mb_list_mbuf.ml_btable == NULL) + goto bad; + mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), + &(mb_list_mbuf.ml_maptop), mb_map_size); + mb_list_mbuf.ml_map->system_map = 1; + mb_list_mbuf.ml_mapfull = 0; + mb_list_mbuf.ml_objsize = MSIZE; + mb_list_mbuf.ml_wmhigh = &mbuf_limit; + + mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); + mb_map_size = rounddown(mb_map_size, PAGE_SIZE); + mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE + * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); + if (mb_list_clust.ml_btable == NULL) + goto bad; + mb_list_clust.ml_map = kmem_suballoc(kmem_map, + &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), + mb_map_size); + mb_list_clust.ml_map->system_map = 1; + mb_list_clust.ml_mapfull = 0; + mb_list_clust.ml_objsize = MCLBYTES; + mb_list_clust.ml_wmhigh = &clust_limit; + + /* + * Allocate required general (global) containers for each object type. + */ + mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, + M_NOWAIT); + mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, + M_NOWAIT); + if ((mb_list_mbuf.ml_genlist == NULL) || + (mb_list_clust.ml_genlist == NULL)) + goto bad; + + /* + * Initialize condition variables and general container mutex locks. + */ + mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0); + cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); + cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), + "mcluster pool starved"); + mb_list_mbuf.ml_genlist->mb_cont.mc_lock = + mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; + + /* + * Set up the general containers for each object. + */ + mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = + mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; + mb_list_mbuf.ml_genlist->mb_cont.mc_starved = + mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; + mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); + mb_list_clust.ml_genlist->mb_cont.mc_objcount = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); + mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs); + mb_list_clust.ml_genlist->mb_cont.mc_numpgs = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs); + mb_list_mbuf.ml_genlist->mb_cont.mc_types = + &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); + mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; + SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); + SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); + + /* + * Initialize general mbuf statistics. + */ + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + mbstat.m_numtypes = MT_NTYPES; + + /* + * Allocate and initialize PCPU containers. 
+ */ + for (i = 0; i < NCPU; i++) { + if (CPU_ABSENT(i)) + continue; + + mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), + M_MBUF, M_NOWAIT); + mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), + M_MBUF, M_NOWAIT); + if ((mb_list_mbuf.ml_cntlst[i] == NULL) || + (mb_list_clust.ml_cntlst[i] == NULL)) + goto bad; + + mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; + + mb_statpcpu[i].mb_active = 1; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = + mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = + &(mb_statpcpu[i].mb_mbfree); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = + &(mb_statpcpu[i].mb_clfree); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs = + &(mb_statpcpu[i].mb_mbpgs); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs = + &(mb_statpcpu[i].mb_clpgs); + mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = + &(mb_statpcpu[i].mb_mbtypes[0]); + mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; + + SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); + SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); + + /* + * Perform initial allocations. + */ + pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); + MB_LOCK_CONT(pcpu_cnt); + for (j = 0; j < NMB_MBUF_INIT; j++) { + if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) + == NULL) + goto bad; + } + MB_UNLOCK_CONT(pcpu_cnt); + + pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); + MB_LOCK_CONT(pcpu_cnt); + for (j = 0; j < NMB_CLUST_INIT; j++) { + if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) + == NULL) + goto bad; + } + MB_UNLOCK_CONT(pcpu_cnt); + } + + return; +bad: + panic("mbuf_init(): failed to initialize mbuf subsystem!"); +} + +/* + * Populate a given mbuf PCPU container with a bucket full of fresh new + * buffers. Return a pointer to the new bucket (already in the container if + * successful), or return NULL on failure. + * + * LOCKING NOTES: + * PCPU container lock must be held when this is called. + * The lock is dropped here so that we can cleanly call the underlying VM + * code. If we fail, we return with no locks held. If we succeed (i.e., return + * non-NULL), we return with the PCPU lock held, ready for allocation from + * the returned bucket. + */ +static struct mb_bucket * +mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) +{ + struct mb_bucket *bucket; + caddr_t p; + int i; + + MB_UNLOCK_CONT(cnt_lst); + /* + * If our object's (finite) map is starved now (i.e., no more address + * space), bail out now. + */ + if (mb_list->ml_mapfull) + return (NULL); + + bucket = malloc(sizeof(struct mb_bucket) + + PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF, + how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); + if (bucket == NULL) + return (NULL); + + p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE, + how == M_TRYWAIT ? 
M_WAITOK : M_NOWAIT); + if (p == NULL) { + free(bucket, M_MBUF); + if (how == M_TRYWAIT) + mb_list->ml_mapfull = 1; + return (NULL); + } + + bucket->mb_numfree = 0; + mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; + for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) { + bucket->mb_free[i] = p; + bucket->mb_numfree++; + p += mb_list->ml_objsize; + } + + MB_LOCK_CONT(cnt_lst); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); + (*(cnt_lst->mb_cont.mc_numpgs))++; + *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; + + return (bucket); +} + +/* + * Allocate an mbuf-subsystem type object. + * The general case is very easy. Complications only arise if our PCPU + * container is empty. Things get worse if the PCPU container is empty, + * the general container is empty, and we've run out of address space + * in our map; then we try to block if we're willing to (M_TRYWAIT). + */ +static __inline +void * +mb_alloc(struct mb_lstmngr *mb_list, int how, short type) +{ + static int last_report; + struct mb_pcpu_list *cnt_lst; + struct mb_bucket *bucket; + void *m; + + m = NULL; + cnt_lst = MB_GET_PCPU_LIST(mb_list); + MB_LOCK_CONT(cnt_lst); + + if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { + /* + * This is the easy allocation case. We just grab an object + * from a bucket in the PCPU container. At worst, we + * have just emptied the bucket and so we remove it + * from the container. + */ + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + struct mb_gen_list *gen_list; + + /* + * This is the less-common more difficult case. We must + * first verify if the general list has anything for us + * and if that also fails, we must allocate a page from + * the map and create a new bucket to place in our PCPU + * container (already locked). If the map is starved then + * we're really in for trouble, as we have to wait on + * the general container's condition variable. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + + if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) + != NULL) { + /* + * Give ownership of the bucket to our CPU's + * container, but only actually put the bucket + * in the container if it doesn't become free + * upon removing an mbuf from it. + */ + SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), + mb_blist); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + (*(gen_list->mb_cont.mc_numpgs))--; + (*(cnt_lst->mb_cont.mc_numpgs))++; + *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; + bucket->mb_numfree--; + m = bucket->mb_free[(bucket->mb_numfree)]; + if (bucket->mb_numfree == 0) { + SLIST_NEXT(bucket, mb_blist) = NULL; + bucket->mb_owner |= MB_BUCKET_FREE; + } else { + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), + bucket, mb_blist); + *(cnt_lst->mb_cont.mc_objcount) += + bucket->mb_numfree; + } + MB_UNLOCK_CONT(gen_list); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + /* + * We'll have to allocate a new page. + */ + MB_UNLOCK_CONT(gen_list); + bucket = mb_pop_cont(mb_list, how, cnt_lst); + if (bucket != NULL) { + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + } else { + if (how == M_TRYWAIT) { + /* + * Absolute worst-case scenario. + * We block if we're willing to, but + * only after trying to steal from + * other lists. + */ + m = mb_alloc_wait(mb_list, type); + } else { + /* XXX: No consistency. 
*/ + mbstat.m_drops++; + + if (ticks < last_report || + (ticks - last_report) >= hz) { + last_report = ticks; + printf( +"All mbufs exhausted, please see tuning(7).\n"); +/* XXX: Actually could be clusters, but it gets the point across. */ + } + + } + } + } + } + + return (m); +} + +/* + * This is the worst-case scenario called only if we're allocating with + * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf + * by looking in every PCPU container. If we're still unsuccesful, we + * try the general container one last time and possibly block on our + * starved cv. + */ +static void * +mb_alloc_wait(struct mb_lstmngr *mb_list, short type) +{ + struct mb_pcpu_list *cnt_lst; + struct mb_gen_list *gen_list; + struct mb_bucket *bucket; + void *m; + int i, cv_ret; + + /* + * Try to reclaim mbuf-related objects (mbufs, clusters). + */ + mb_reclaim(); + + /* + * Cycle all the PCPU containers. Increment starved counts if found + * empty. + */ + for (i = 0; i < NCPU; i++) { + if (CPU_ABSENT(i)) + continue; + cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); + MB_LOCK_CONT(cnt_lst); + + /* + * If container is non-empty, get a single object from it. + * If empty, increment starved count. + */ + if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != + NULL) { + MB_GET_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_INC(cnt_lst, type, 1); + MB_UNLOCK_CONT(cnt_lst); + mbstat.m_wait++; /* XXX: No consistency. */ + return (m); + } else + cnt_lst->mb_cont.mc_starved++; + + MB_UNLOCK_CONT(cnt_lst); + } + + /* + * We're still here, so that means it's time to get the general + * container lock, check it one more time (now that mb_reclaim() + * has been called) and if we still get nothing, block on the cv. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { + MB_GET_OBJECT(m, bucket, gen_list); + MB_MBTYPES_INC(gen_list, type, 1); + MB_UNLOCK_CONT(gen_list); + mbstat.m_wait++; /* XXX: No consistency. */ + return (m); + } + + gen_list->mb_cont.mc_starved++; + cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), + gen_list->mb_cont.mc_lock, mbuf_wait); + gen_list->mb_cont.mc_starved--; + + if ((cv_ret == 0) && + ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { + MB_GET_OBJECT(m, bucket, gen_list); + MB_MBTYPES_INC(gen_list, type, 1); + mbstat.m_wait++; /* XXX: No consistency. */ + } else { + mbstat.m_drops++; /* XXX: No consistency. */ + m = NULL; + } + + MB_UNLOCK_CONT(gen_list); + + return (m); +} + +/*- + * Free an object to its rightful container. + * In the very general case, this operation is really very easy. + * Complications arise primarily if: + * (a) We've hit the high limit on number of free objects allowed in + * our PCPU container. + * (b) We're in a critical situation where our container has been + * marked 'starved' and we need to issue wakeups on the starved + * condition variable. + * (c) Minor (odd) cases: our bucket has migrated while we were + * waiting for the lock; our bucket is in the general container; + * our bucket is empty. + */ +static __inline +void +mb_free(struct mb_lstmngr *mb_list, void *m, short type) +{ + struct mb_pcpu_list *cnt_lst; + struct mb_gen_list *gen_list; + struct mb_bucket *bucket; + u_int owner; + + bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; + + /* + * Make sure that if after we lock the bucket's present container the + * bucket has migrated, that we drop the lock and get the new one. 
+ */ +retry_lock: + owner = bucket->mb_owner & ~MB_BUCKET_FREE; + switch (owner) { + case MB_GENLIST_OWNER: + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { + MB_UNLOCK_CONT(gen_list); + goto retry_lock; + } + + /* + * If we're intended for the general container, this is + * real easy: no migrating required. The only `bogon' + * is that we're now contending with all the threads + * dealing with the general list, but this is expected. + */ + MB_PUT_OBJECT(m, bucket, gen_list); + MB_MBTYPES_DEC(gen_list, type, 1); + if (gen_list->mb_cont.mc_starved > 0) + cv_signal(&(gen_list->mgl_mstarved)); + MB_UNLOCK_CONT(gen_list); + break; + + default: + cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); + MB_LOCK_CONT(cnt_lst); + if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { + MB_UNLOCK_CONT(cnt_lst); + goto retry_lock; + } + + MB_PUT_OBJECT(m, bucket, cnt_lst); + MB_MBTYPES_DEC(cnt_lst, type, 1); + + if (cnt_lst->mb_cont.mc_starved > 0) { + /* + * This is a tough case. It means that we've + * been flagged at least once to indicate that + * we're empty, and that the system is in a critical + * situation, so we ought to migrate at least one + * bucket over to the general container. + * There may or may not be a thread blocking on + * the starved condition variable, but chances + * are that one will eventually come up soon so + * it's better to migrate now than never. + */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0, + ("mb_free: corrupt bucket %p\n", bucket)); + SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = MB_GENLIST_OWNER; + (*(cnt_lst->mb_cont.mc_objcount))--; + (*(gen_list->mb_cont.mc_objcount))++; + (*(cnt_lst->mb_cont.mc_numpgs))--; + (*(gen_list->mb_cont.mc_numpgs))++; + + /* + * Determine whether or not to keep transferring + * buckets to the general list or whether we've + * transferred enough already. + * We realize that although we may flag another + * bucket to be migrated to the general container + * that in the meantime, the thread that was + * blocked on the cv is already woken up and + * long gone. But in that case, the worst + * consequence is that we will end up migrating + * one bucket too many, which is really not a big + * deal, especially if we're close to a critical + * situation. + */ + if (gen_list->mb_cont.mc_starved > 0) { + cnt_lst->mb_cont.mc_starved--; + cv_signal(&(gen_list->mgl_mstarved)); + } else + cnt_lst->mb_cont.mc_starved = 0; + + MB_UNLOCK_CONT(gen_list); + MB_UNLOCK_CONT(cnt_lst); + break; + } + + if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) { + /* + * We've hit the high limit of allowed numbers of mbufs + * on this PCPU list. We must now migrate a bucket + * over to the general container. 
+ */ + gen_list = MB_GET_GEN_LIST(mb_list); + MB_LOCK_CONT(gen_list); + if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { + bucket = + SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); + SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), + mb_blist); + } + SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = MB_GENLIST_OWNER; + *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; + *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; + (*(cnt_lst->mb_cont.mc_numpgs))--; + (*(gen_list->mb_cont.mc_numpgs))++; + + /* + * While we're at it, transfer some of the mbtypes + * "count load" onto the general list's mbtypes + * array, seeing as how we're moving the bucket + * there now, meaning that the freeing of objects + * there will now decrement the _general list's_ + * mbtypes counters, and no longer our PCPU list's + * mbtypes counters. We do this for the type presently + * being freed in an effort to keep the mbtypes + * counters approximately balanced across all lists. + */ + MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE / + mb_list->ml_objsize) - bucket->mb_numfree); + MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE / + mb_list->ml_objsize) - bucket->mb_numfree); + + MB_UNLOCK_CONT(gen_list); + MB_UNLOCK_CONT(cnt_lst); + break; + } + + if (bucket->mb_owner & MB_BUCKET_FREE) { + SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), + bucket, mb_blist); + bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; + } + + MB_UNLOCK_CONT(cnt_lst); + break; + } +} + +/* + * Drain protocols in hopes to free up some resources. + * + * LOCKING NOTES: + * No locks should be held when this is called. The drain routines have to + * presently acquire some locks which raises the possibility of lock order + * violation if we're holding any mutex if that mutex is acquired in reverse + * order relative to one of the locks in the drain routines. + */ +static void +mb_reclaim(void) +{ + struct domain *dp; + struct protosw *pr; + +/* + * XXX: Argh, we almost always trip here with witness turned on now-a-days + * XXX: because we often come in with Giant held. For now, there's no way + * XXX: to avoid this. + */ +#ifdef WITNESS + KASSERT(witness_list(curthread) == 0, + ("mb_reclaim() called with locks held")); +#endif + + mbstat.m_drain++; /* XXX: No consistency. */ + + for (dp = domains; dp != NULL; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain != NULL) + (*pr->pr_drain)(); +} + +/* + * Local mbuf & cluster alloc macros and routines. + * Local macro and function names begin with an underscore ("_"). + */ +static void _mclfree(struct mbuf *); + +#define _m_get(m, how, type) do { \ + (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \ + if ((m) != NULL) { \ + (m)->m_type = (type); \ + (m)->m_next = NULL; \ + (m)->m_nextpkt = NULL; \ + (m)->m_data = (m)->m_dat; \ + (m)->m_flags = 0; \ + } \ +} while (0) + +#define _m_gethdr(m, how, type) do { \ + (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \ + if ((m) != NULL) { \ + (m)->m_type = (type); \ + (m)->m_next = NULL; \ + (m)->m_nextpkt = NULL; \ + (m)->m_data = (m)->m_pktdat; \ + (m)->m_flags = M_PKTHDR; \ + (m)->m_pkthdr.rcvif = NULL; \ + (m)->m_pkthdr.csum_flags = 0; \ + (m)->m_pkthdr.aux = NULL; \ + } \ +} while (0) + +/* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME). 
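The exported m_gethdr()/m_clget()/m_free() wrappers that follow form the public allocation interface used by the rest of the kernel. A minimal, hypothetical sketch of typical consumer usage: grab a packet header mbuf, attach a cluster when the payload does not fit inline, and clean up on failure (m_freem() is provided elsewhere in the mbuf code).

/* Hypothetical sketch of consumer usage of the exported routines below. */
static struct mbuf *
example_alloc_packet(int len)
{
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	if (len > MHLEN) {
		m_clget(m, M_DONTWAIT);		/* attaches an MCLBYTES cluster */
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);		/* cluster allocation failed */
			return (NULL);
		}
	}
	m->m_len = m->m_pkthdr.len = len;
	return (m);
}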
*/ +#define _m_free(m, n) do { \ + (n) = (m)->m_next; \ + if ((m)->m_flags & M_EXT) \ + MEXTFREE((m)); \ + if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \ + m_freem((m)->m_pkthdr.aux); \ + (m)->m_pkthdr.aux = NULL; \ + } \ + mb_free(&mb_list_mbuf, (m), (m)->m_type); \ +} while (0) + +#define _mext_init_ref(m) do { \ + (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT); \ + if ((m)->m_ext.ref_cnt != NULL) { \ + *((m)->m_ext.ref_cnt) = 0; \ + MEXT_ADD_REF((m)); \ + } \ +} while (0) + +#define _mext_dealloc_ref(m) \ + free((m)->m_ext.ref_cnt, M_MBUF) + +void +_mext_free(struct mbuf *mb) +{ + + if (mb->m_ext.ext_type == EXT_CLUSTER) + mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF); + else + (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); + _mext_dealloc_ref(mb); +} + +/* + * We only include this here to avoid making m_clget() excessively large + * due to too much inlined code. + */ +static void +_mclfree(struct mbuf *mb) +{ + + mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF); + mb->m_ext.ext_buf = NULL; +} + +/* + * Exported space allocation and de-allocation routines. + */ +struct mbuf * +m_get(int how, int type) +{ + struct mbuf *mb; + + _m_get(mb, how, type); + return (mb); +} + +struct mbuf * +m_gethdr(int how, int type) +{ + struct mbuf *mb; + + _m_gethdr(mb, how, type); + return (mb); +} + +struct mbuf * +m_get_clrd(int how, int type) +{ + struct mbuf *mb; + + _m_get(mb, how, type); + if (mb != NULL) + bzero(mtod(mb, caddr_t), MLEN); + return (mb); +} + +struct mbuf * +m_gethdr_clrd(int how, int type) +{ + struct mbuf *mb; + + _m_gethdr(mb, how, type); + if (mb != NULL) + bzero(mtod(mb, caddr_t), MHLEN); + return (mb); +} + +struct mbuf * +m_free(struct mbuf *mb) +{ + struct mbuf *nb; + + _m_free(mb, nb); + return (nb); +} + +void +m_clget(struct mbuf *mb, int how) +{ + + mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF); + if (mb->m_ext.ext_buf != NULL) { + _mext_init_ref(mb); + if (mb->m_ext.ref_cnt == NULL) + _mclfree(mb); + else { + mb->m_data = mb->m_ext.ext_buf; + mb->m_flags |= M_EXT; + mb->m_ext.ext_free = NULL; + mb->m_ext.ext_args = NULL; + mb->m_ext.ext_size = MCLBYTES; + mb->m_ext.ext_type = EXT_CLUSTER; + } + } +} + +void +m_extadd(struct mbuf *mb, caddr_t buf, u_int size, + void (*freef)(void *, void *), void *args, short flags, int type) +{ + + _mext_init_ref(mb); + if (mb->m_ext.ref_cnt != NULL) { + mb->m_flags |= (M_EXT | flags); + mb->m_ext.ext_buf = buf; + mb->m_data = mb->m_ext.ext_buf; + mb->m_ext.ext_size = size; + mb->m_ext.ext_free = freef; + mb->m_ext.ext_args = args; + mb->m_ext.ext_type = type; + } +} + +/* + * Change type for mbuf `mb'; this is a relatively expensive operation and + * should be avoided. + */ +void +m_chtype(struct mbuf *mb, short new_type) +{ + struct mb_gen_list *gen_list; + + gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); + MB_LOCK_CONT(gen_list); + MB_MBTYPES_DEC(gen_list, mb->m_type, 1); + MB_MBTYPES_INC(gen_list, new_type, 1); + MB_UNLOCK_CONT(gen_list); + mb->m_type = new_type; +} diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c new file mode 100644 index 0000000..1a8c4bd --- /dev/null +++ b/sys/kern/subr_mchain.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2000, 2001 Boris Popov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Boris Popov. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/endian.h> +#include <sys/errno.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/uio.h> + +#include <sys/mchain.h> + +MODULE_VERSION(libmchain, 1); + +#define MBERROR(format, args...) printf("%s(%d): "format, __func__ , \ + __LINE__ ,## args) + +#define MBPANIC(format, args...) printf("%s(%d): "format, __func__ , \ + __LINE__ ,## args) + +/* + * Various helper functions + */ +int +m_fixhdr(struct mbuf *m0) +{ + struct mbuf *m = m0; + int len = 0; + + while (m) { + len += m->m_len; + m = m->m_next; + } + m0->m_pkthdr.len = len; + return len; +} + +int +mb_init(struct mbchain *mbp) +{ + struct mbuf *m; + + m = m_gethdr(M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + m->m_len = 0; + mb_initm(mbp, m); + return 0; +} + +void +mb_initm(struct mbchain *mbp, struct mbuf *m) +{ + bzero(mbp, sizeof(*mbp)); + mbp->mb_top = mbp->mb_cur = m; + mbp->mb_mleft = M_TRAILINGSPACE(m); +} + +void +mb_done(struct mbchain *mbp) +{ + if (mbp->mb_top) { + m_freem(mbp->mb_top); + mbp->mb_top = NULL; + } +} + +struct mbuf * +mb_detach(struct mbchain *mbp) +{ + struct mbuf *m; + + m = mbp->mb_top; + mbp->mb_top = NULL; + return m; +} + +int +mb_fixhdr(struct mbchain *mbp) +{ + return mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top); +} + +/* + * Check if object of size 'size' fit to the current position and + * allocate new mbuf if not. Advance pointers and increase length of mbuf(s). + * Return pointer to the object placeholder or NULL if any error occured. 
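As an illustration of the mbchain building interface, a hypothetical routine assembling a small little-endian request with the mb_put_*() routines defined just below; the opcode and message layout are made up.

/* Hypothetical sketch of composing a message with an mbchain. */
static int
example_build_request(struct mbuf **mp, c_caddr_t payload, int plen)
{
	struct mbchain mb;
	int error;

	if ((error = mb_init(&mb)) != 0)
		return (error);
	mb_put_uint16le(&mb, 0x0001);		/* hypothetical opcode */
	mb_put_uint32le(&mb, plen);		/* payload length */
	error = mb_put_mem(&mb, payload, plen, MB_MSYSTEM);
	if (error) {
		mb_done(&mb);			/* frees the partial chain */
		return (error);
	}
	mb_fixhdr(&mb);				/* recompute m_pkthdr.len */
	*mp = mb_detach(&mb);			/* caller now owns the chain */
	return (0);
}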
+ * Note: size should be <= MLEN + */ +caddr_t +mb_reserve(struct mbchain *mbp, int size) +{ + struct mbuf *m, *mn; + caddr_t bpos; + + if (size > MLEN) + panic("mb_reserve: size = %d\n", size); + m = mbp->mb_cur; + if (mbp->mb_mleft < size) { + mn = m_get(M_TRYWAIT, MT_DATA); + if (mn == NULL) + return NULL; + mbp->mb_cur = m->m_next = mn; + m = mn; + m->m_len = 0; + mbp->mb_mleft = M_TRAILINGSPACE(m); + } + mbp->mb_mleft -= size; + mbp->mb_count += size; + bpos = mtod(m, caddr_t) + m->m_len; + m->m_len += size; + return bpos; +} + +int +mb_put_uint8(struct mbchain *mbp, u_int8_t x) +{ + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint16be(struct mbchain *mbp, u_int16_t x) +{ + x = htobes(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint16le(struct mbchain *mbp, u_int16_t x) +{ + x = htoles(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint32be(struct mbchain *mbp, u_int32_t x) +{ + x = htobel(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_uint32le(struct mbchain *mbp, u_int32_t x) +{ + x = htolel(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_int64be(struct mbchain *mbp, int64_t x) +{ + x = htobeq(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_int64le(struct mbchain *mbp, int64_t x) +{ + x = htoleq(x); + return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM); +} + +int +mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type) +{ + struct mbuf *m; + caddr_t dst; + c_caddr_t src; + int cplen, error, mleft, count; + + m = mbp->mb_cur; + mleft = mbp->mb_mleft; + + while (size > 0) { + if (mleft == 0) { + if (m->m_next == NULL) { + m = m_getm(m, size, M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + } + m = m->m_next; + mleft = M_TRAILINGSPACE(m); + continue; + } + cplen = mleft > size ? size : mleft; + dst = mtod(m, caddr_t) + m->m_len; + switch (type) { + case MB_MCUSTOM: + error = mbp->mb_copy(mbp, source, dst, cplen); + if (error) + return error; + break; + case MB_MINLINE: + for (src = source, count = cplen; count; count--) + *dst++ = *src++; + break; + case MB_MSYSTEM: + bcopy(source, dst, cplen); + break; + case MB_MUSER: + error = copyin(source, dst, cplen); + if (error) + return error; + break; + case MB_MZERO: + bzero(dst, cplen); + break; + } + size -= cplen; + source += cplen; + m->m_len += cplen; + mleft -= cplen; + mbp->mb_count += cplen; + } + mbp->mb_cur = m; + mbp->mb_mleft = mleft; + return 0; +} + +int +mb_put_mbuf(struct mbchain *mbp, struct mbuf *m) +{ + mbp->mb_cur->m_next = m; + while (m) { + mbp->mb_count += m->m_len; + if (m->m_next == NULL) + break; + m = m->m_next; + } + mbp->mb_mleft = M_TRAILINGSPACE(m); + mbp->mb_cur = m; + return 0; +} + +/* + * copies a uio scatter/gather list to an mbuf chain. + */ +int +mb_put_uio(struct mbchain *mbp, struct uio *uiop, int size) +{ + long left; + int mtype, error; + + mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? 
MB_MSYSTEM : MB_MUSER; + + while (size > 0 && uiop->uio_resid) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return EFBIG; + left = uiop->uio_iov->iov_len; + if (left == 0) { + uiop->uio_iov++; + uiop->uio_iovcnt--; + continue; + } + if (left > size) + left = size; + error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype); + if (error) + return error; + uiop->uio_offset += left; + uiop->uio_resid -= left; + uiop->uio_iov->iov_base += left; + uiop->uio_iov->iov_len -= left; + size -= left; + } + return 0; +} + +/* + * Routines for fetching data from an mbuf chain + */ +int +md_init(struct mdchain *mdp) +{ + struct mbuf *m; + + m = m_gethdr(M_TRYWAIT, MT_DATA); + if (m == NULL) + return ENOBUFS; + m->m_len = 0; + md_initm(mdp, m); + return 0; +} + +void +md_initm(struct mdchain *mdp, struct mbuf *m) +{ + bzero(mdp, sizeof(*mdp)); + mdp->md_top = mdp->md_cur = m; + mdp->md_pos = mtod(m, u_char*); +} + +void +md_done(struct mdchain *mdp) +{ + if (mdp->md_top) { + m_freem(mdp->md_top); + mdp->md_top = NULL; + } +} + +/* + * Append a separate mbuf chain. It is caller responsibility to prevent + * multiple calls to fetch/record routines. + */ +void +md_append_record(struct mdchain *mdp, struct mbuf *top) +{ + struct mbuf *m; + + if (mdp->md_top == NULL) { + md_initm(mdp, top); + return; + } + m = mdp->md_top; + while (m->m_nextpkt) + m = m->m_nextpkt; + m->m_nextpkt = top; + top->m_nextpkt = NULL; + return; +} + +/* + * Put next record in place of existing + */ +int +md_next_record(struct mdchain *mdp) +{ + struct mbuf *m; + + if (mdp->md_top == NULL) + return ENOENT; + m = mdp->md_top->m_nextpkt; + md_done(mdp); + if (m == NULL) + return ENOENT; + md_initm(mdp, m); + return 0; +} + +int +md_get_uint8(struct mdchain *mdp, u_int8_t *x) +{ + return md_get_mem(mdp, x, 1, MB_MINLINE); +} + +int +md_get_uint16(struct mdchain *mdp, u_int16_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE); +} + +int +md_get_uint16le(struct mdchain *mdp, u_int16_t *x) +{ + u_int16_t v; + int error = md_get_uint16(mdp, &v); + + *x = letohs(v); + return error; +} + +int +md_get_uint16be(struct mdchain *mdp, u_int16_t *x) { + u_int16_t v; + int error = md_get_uint16(mdp, &v); + + *x = betohs(v); + return error; +} + +int +md_get_uint32(struct mdchain *mdp, u_int32_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE); +} + +int +md_get_uint32be(struct mdchain *mdp, u_int32_t *x) +{ + u_int32_t v; + int error; + + error = md_get_uint32(mdp, &v); + *x = betohl(v); + return error; +} + +int +md_get_uint32le(struct mdchain *mdp, u_int32_t *x) +{ + u_int32_t v; + int error; + + error = md_get_uint32(mdp, &v); + *x = letohl(v); + return error; +} + +int +md_get_int64(struct mdchain *mdp, int64_t *x) +{ + return md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE); +} + +int +md_get_int64be(struct mdchain *mdp, int64_t *x) +{ + int64_t v; + int error; + + error = md_get_int64(mdp, &v); + *x = betohq(v); + return error; +} + +int +md_get_int64le(struct mdchain *mdp, int64_t *x) +{ + int64_t v; + int error; + + error = md_get_int64(mdp, &v); + *x = letohq(v); + return error; +} + +int +md_get_mem(struct mdchain *mdp, caddr_t target, int size, int type) +{ + struct mbuf *m = mdp->md_cur; + int error; + u_int count; + u_char *s; + + while (size > 0) { + if (m == NULL) { + MBERROR("incomplete copy\n"); + return EBADRPC; + } + s = mdp->md_pos; + count = mtod(m, u_char*) + m->m_len - s; + if (count == 0) { + mdp->md_cur = m = m->m_next; + if (m) + s = mdp->md_pos = mtod(m, caddr_t); + continue; + } + if (count 
> size) + count = size; + size -= count; + mdp->md_pos += count; + if (target == NULL) + continue; + switch (type) { + case MB_MUSER: + error = copyout(s, target, count); + if (error) + return error; + break; + case MB_MSYSTEM: + bcopy(s, target, count); + break; + case MB_MINLINE: + while (count--) + *target++ = *s++; + continue; + } + target += count; + } + return 0; +} + +int +md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret) +{ + struct mbuf *m = mdp->md_cur, *rm; + + rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_TRYWAIT); + if (rm == NULL) + return EBADRPC; + md_get_mem(mdp, NULL, size, MB_MZERO); + *ret = rm; + return 0; +} + +int +md_get_uio(struct mdchain *mdp, struct uio *uiop, int size) +{ + char *uiocp; + long left; + int mtype, error; + + mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER; + while (size > 0 && uiop->uio_resid) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return EFBIG; + left = uiop->uio_iov->iov_len; + if (left == 0) { + uiop->uio_iov++; + uiop->uio_iovcnt--; + continue; + } + uiocp = uiop->uio_iov->iov_base; + if (left > size) + left = size; + error = md_get_mem(mdp, uiocp, left, mtype); + if (error) + return error; + uiop->uio_offset += left; + uiop->uio_resid -= left; + uiop->uio_iov->iov_base += left; + uiop->uio_iov->iov_len -= left; + size -= left; + } + return 0; +} diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c new file mode 100644 index 0000000..ce74eca --- /dev/null +++ b/sys/kern/subr_module.c @@ -0,0 +1,266 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/linker.h> + +/* + * Preloaded module support + */ + +caddr_t preload_metadata; + +/* + * Search for the preloaded module (name) + */ +caddr_t +preload_search_by_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if ((hdr[0] == MODINFO_NAME) && + !strcmp(name, curp + sizeof(u_int32_t) * 2)) + return(curp); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Search for the first preloaded module of (type) + */ +caddr_t +preload_search_by_type(const char *type) +{ + caddr_t curp, lname; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(u_int32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Walk through the preloaded module list + */ +caddr_t +preload_search_next_name(caddr_t base) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + /* Pick up where we left off last time */ + if (base) { + /* skip to next field */ + curp = base; + hdr = (u_int32_t *)curp; + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } else + curp = preload_metadata; + + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Found a new record? */ + if (hdr[0] == MODINFO_NAME) + return curp; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Given a preloaded module handle (mod), return a pointer + * to the data for the attribute (inf). + */ +caddr_t +preload_search_info(caddr_t mod, int inf) +{ + caddr_t curp; + u_int32_t *hdr; + u_int32_t type = 0; + int next; + + curp = mod; + for (;;) { + hdr = (u_int32_t *)curp; + /* end of module data? */ + if (hdr[0] == 0 && hdr[1] == 0) + break; + /* + * We give up once we've looped back to what we were looking at + * first - this should normally be a MODINFO_NAME field. + */ + if (type == 0) { + type = hdr[0]; + } else { + if (hdr[0] == type) + break; + } + + /* + * Attribute match? Return pointer to data. + * Consumer may safely assume that size value precedes + * data. + */ + if (hdr[0] == inf) + return(curp + (sizeof(u_int32_t) * 2)); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + return(NULL); +} + +/* + * Delete a preload record by name. 
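+ * Like the search routines above, this walks the metadata as a sequence
+ * of (type word, length word, data) records, each rounded up to a
+ * sizeof(u_long) boundary and terminated by a pair of zero words.
+ *
+ * A minimal lookup sketch (the type string and attribute chosen here are
+ * only illustrative):
+ *
+ *	caddr_t mod, attr;
+ *
+ *	mod = preload_search_by_type("elf kernel");
+ *	if (mod != NULL)
+ *		attr = preload_search_info(mod, MODINFO_ADDR);
+ *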
+ */ +void +preload_delete_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + int clearing; + + if (preload_metadata != NULL) { + + clearing = 0; + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if (hdr[0] == MODINFO_NAME) { + if (!strcmp(name, curp + sizeof(u_int32_t) * 2)) + clearing = 1; /* got it, start clearing */ + else if (clearing) + clearing = 0; /* at next one now.. better stop */ + } + if (clearing) + hdr[0] = MODINFO_EMPTY; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} + +/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */ +void +preload_bootstrap_relocate(vm_offset_t offset) +{ + caddr_t curp; + u_int32_t *hdr; + vm_offset_t *ptr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Deal with the ones that we know we have to fix */ + switch (hdr[0]) { + case MODINFO_ADDR: + case MODINFO_METADATA|MODINFOMD_SSYM: + case MODINFO_METADATA|MODINFOMD_ESYM: + ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2)); + *ptr += offset; + break; + } + /* The rest is beyond us for now */ + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..820fe0107 --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include "opt_maxusers.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> + +#include <machine/vmparam.h> + +/* + * System parameter formulae. + */ + +#ifndef HZ +#define HZ 100 +#endif +#define NPROC (20 + 16 * maxusers) +#ifndef NBUF +#define NBUF 0 +#endif +#ifndef MAXFILES +#define MAXFILES (maxproc * 2) +#endif + +int hz; +int tick; +int maxusers; /* base tunable */ +int maxproc; /* maximum # of processes */ +int maxprocperuid; /* max # of procs per user */ +int maxfiles; /* sys. wide open files limit */ +int maxfilesperproc; /* per-proc open files limit */ +int ncallout; /* maximum # of timer events */ +int nbuf; +int nswbuf; +int maxswzone; /* max swmeta KVA storage */ +int maxbcache; /* max buffer cache KVA storage */ +u_quad_t maxtsiz; /* max text size */ +u_quad_t dfldsiz; /* initial data size limit */ +u_quad_t maxdsiz; /* max data size */ +u_quad_t dflssiz; /* initial stack size limit */ +u_quad_t maxssiz; /* max stack size */ +u_quad_t sgrowsiz; /* amount to grow stack */ + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; + +/* + * Boot time overrides that are not scaled against main memory + */ +void +init_param1(void) +{ + + hz = HZ; + TUNABLE_INT_FETCH("kern.hz", &hz); + tick = 1000000 / hz; + +#ifdef VM_SWZONE_SIZE_MAX + maxswzone = VM_SWZONE_SIZE_MAX; +#endif + TUNABLE_INT_FETCH("kern.maxswzone", &maxswzone); +#ifdef VM_BCACHE_SIZE_MAX + maxbcache = VM_BCACHE_SIZE_MAX; +#endif + TUNABLE_INT_FETCH("kern.maxbcache", &maxbcache); + + maxtsiz = MAXTSIZ; + TUNABLE_QUAD_FETCH("kern.maxtsiz", &maxtsiz); + dfldsiz = DFLDSIZ; + TUNABLE_QUAD_FETCH("kern.dfldsiz", &dfldsiz); + maxdsiz = MAXDSIZ; + TUNABLE_QUAD_FETCH("kern.maxdsiz", &maxdsiz); + dflssiz = DFLSSIZ; + TUNABLE_QUAD_FETCH("kern.dflssiz", &dflssiz); + maxssiz = MAXSSIZ; + TUNABLE_QUAD_FETCH("kern.maxssiz", &maxssiz); + sgrowsiz = SGROWSIZ; + TUNABLE_QUAD_FETCH("kern.sgrowsiz", &sgrowsiz); +} + +/* + * Boot time overrides that are scaled against main memory + */ +void +init_param2(int physpages) +{ + + /* Base parameters */ + maxusers = MAXUSERS; + TUNABLE_INT_FETCH("kern.maxusers", &maxusers); + if (maxusers == 0) { + maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE); + if (maxusers < 32) + maxusers = 32; + if (maxusers > 384) + maxusers = 384; + } + + /* + * The following can be overridden after boot via sysctl. Note: + * unless overriden, these macros are ultimately based on maxusers. + */ + maxproc = NPROC; + TUNABLE_INT_FETCH("kern.maxproc", &maxproc); + /* + * Limit maxproc so that kmap entries cannot be exhausted by + * processes. 
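+	 *
+	 * As a rough worked example (assuming 4 KB pages): a machine with
+	 * 512 MB of RAM has physpages = 131072, so the auto-sized maxusers
+	 * above is 131072 / 512 = 256 and maxproc starts at
+	 * 20 + 16 * 256 = 4116; the physpages / 12 cap below (about 10922
+	 * here) does not bite in that case.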
+ */ + if (maxproc > (physpages / 12)) + maxproc = physpages / 12; + maxfiles = MAXFILES; + TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles); + maxprocperuid = (maxproc * 9) / 10; + maxfilesperproc = (maxfiles * 9) / 10; + + /* + * Cannot be changed after boot. + */ + nbuf = NBUF; + TUNABLE_INT_FETCH("kern.nbuf", &nbuf); + + ncallout = 16 + maxproc + maxfiles; + TUNABLE_INT_FETCH("kern.ncallout", &ncallout); +} diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c new file mode 100644 index 0000000..132e957 --- /dev/null +++ b/sys/kern/subr_pcpu.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2001 Wind River Systems, Inc. + * All rights reserved. + * Written by: John Baldwin <jhb@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This module provides MI support for per-cpu data. + * + * Each architecture determines the mapping of logical CPU IDs to physical + * CPUs. The requirements of this mapping are as follows: + * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. + * - The mapping is not required to be dense. That is, there may be + * gaps in the mappings. + * - The platform sets the value of MAXCPU in <machine/param.h>. + * - It is suggested, but not required, that in the non-SMP case, the + * platform define MAXCPU to be 1 and define the logical ID of the + * sole CPU as 0. + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/linker_set.h> +#include <sys/lock.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <ddb/ddb.h> + +static struct pcpu *cpuid_to_pcpu[MAXCPU]; +struct cpuhead cpuhead = SLIST_HEAD_INITIALIZER(cpuhead); + +/* + * Initialize the MI portions of a struct pcpu. + */ +void +pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) +{ + + bzero(pcpu, size); + KASSERT(cpuid >= 0 && cpuid < MAXCPU, + ("pcpu_init: invalid cpuid %d", cpuid)); + pcpu->pc_cpuid = cpuid; + pcpu->pc_cpumask = 1 << cpuid; + cpuid_to_pcpu[cpuid] = pcpu; + SLIST_INSERT_HEAD(&cpuhead, pcpu, pc_allcpu); + cpu_pcpu_init(pcpu, cpuid, size); +} + +/* + * Destroy a struct pcpu. 
+ */ +void +pcpu_destroy(struct pcpu *pcpu) +{ + + SLIST_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); + cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; +} + +/* + * Locate a struct pcpu by cpu id. + */ +struct pcpu * +pcpu_find(u_int cpuid) +{ + + return (cpuid_to_pcpu[cpuid]); +} + +#ifdef DDB +DB_SHOW_COMMAND(pcpu, db_show_pcpu) +{ + struct pcpu *pc; + struct thread *td; + int id; + + if (have_addr) + id = ((addr >> 4) % 16) * 10 + (addr % 16); + else + id = PCPU_GET(cpuid); + pc = pcpu_find(id); + if (pc == NULL) { + db_printf("CPU %d not found\n", id); + return; + } + db_printf("cpuid = %d\n", pc->pc_cpuid); + db_printf("curthread = "); + td = pc->pc_curthread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_printf("curpcb = %p\n", pc->pc_curpcb); + db_printf("fpcurthread = "); + td = pc->pc_fpcurthread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_printf("idlethread = "); + td = pc->pc_idlethread; + if (td != NULL) + db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, + td->td_proc->p_comm); + else + db_printf("none\n"); + db_show_mdpcpu(pc); + +#ifdef WITNESS + db_printf("spin locks held:\n"); + witness_list_locks(&pc->pc_spinlocks); +#endif +} +#endif diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c new file mode 100644 index 0000000..7c96c9e --- /dev/null +++ b/sys/kern/subr_power.c @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2001 Mitsuru IWASAKI + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <sys/power.h> + +static u_int power_pm_type = POWER_PM_TYPE_NONE; +static power_pm_fn_t power_pm_fn = NULL; +static void *power_pm_arg = NULL; + +int +power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg) +{ + int error; + + if (power_pm_type == POWER_PM_TYPE_NONE || + power_pm_type == pm_type) { + power_pm_type = pm_type; + power_pm_fn = pm_fn; + power_pm_arg = pm_arg; + error = 0; + } else { + error = ENXIO; + } + + return (error); +} + +u_int +power_pm_get_type(void) +{ + + return (power_pm_type); +} + +void +power_pm_suspend(int state) +{ + if (power_pm_fn == NULL) + return; + + if (state != POWER_SLEEP_STATE_STANDBY && + state != POWER_SLEEP_STATE_SUSPEND && + state != POWER_SLEEP_STATE_HIBERNATE) + return; + + power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state); +} + +/* + * Power profile. + */ + +static int power_profile_state = POWER_PROFILE_PERFORMANCE; + +int +power_profile_get_state(void) +{ + return (power_profile_state); +} + +void +power_profile_set_state(int state) +{ + int changed; + + if (state != power_profile_state) { + power_profile_state = state; + changed = 1; + printf("system power profile changed to '%s'\n", + (state == POWER_PROFILE_PERFORMANCE) ? "performance" : "economy"); + } else { + changed = 0; + } + + if (changed) + EVENTHANDLER_INVOKE(power_profile_change); +} + diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c new file mode 100644 index 0000000..7f9b790 --- /dev/null +++ b/sys/kern/subr_prf.c @@ -0,0 +1,905 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/kernel.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/tty.h> +#include <sys/syslog.h> +#include <sys/cons.h> +#include <sys/uio.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ +#define MAXNBUF (sizeof(intmax_t) * NBBY + 1) + +struct putchar_arg { + int flags; + int pri; + struct tty *tty; +}; + +struct snprintf_arg { + char *str; + size_t remain; +}; + +extern int log_open; + +struct tty *constty; /* pointer to console "window" tty */ + +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void msglogchar(int c, int pri); +static void msgaddchar(int c, void *dummy); +static void putchar(int ch, void *arg); +static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len); +static void snprintf_func(int ch, void *arg); + +static int consintr = 1; /* Ok to handle console interrupts? */ +static int msgbufmapped; /* Set when safe to use msgbuf */ +int msgbuftrigger; + +static int log_console_output = 1; +SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RW, + &log_console_output, 0, ""); + +/* + * Warn that a system table is full. + */ +void +tablefull(const char *tab) +{ + + log(LOG_ERR, "%s: table is full\n", tab); +} + +/* + * Uprintf prints to the controlling terminal for the current process. + * It may block if the tty queue is overfull. No message is printed if + * the queue does not clear in a reasonable time. + */ +int +uprintf(const char *fmt, ...) +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + va_list ap; + struct putchar_arg pca; + int retval; + + if (td == NULL || td == PCPU_GET(idlethread)) + return (0); + + p = td->td_proc; + PROC_LOCK(p); + if ((p->p_flag & P_CONTROLT) == 0) { + PROC_UNLOCK(p); + return (0); + } + SESS_LOCK(p->p_session); + pca.tty = p->p_session->s_ttyp; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + if (pca.tty == NULL) + return (0); + pca.flags = TOTTY; + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + + return (retval); +} + +/* + * tprintf prints on the controlling terminal associated + * with the given session, possibly to the log as well. + */ +void +tprintf(struct proc *p, int pri, const char *fmt, ...) 
+{ + struct tty *tp = NULL; + int flags = 0, shld = 0; + va_list ap; + struct putchar_arg pca; + int retval; + + if (pri != -1) + flags |= TOLOG; + if (p != NULL) { + PROC_LOCK(p); + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + SESS_LOCK(p->p_session); + SESSHOLD(p->p_session); + tp = p->p_session->s_ttyp; + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + shld++; + if (ttycheckoutq(tp, 0)) + flags |= TOTTY; + else + tp = NULL; + } else + PROC_UNLOCK(p); + } + pca.pri = pri; + pca.tty = tp; + pca.flags = flags; + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (shld) { + PROC_LOCK(p); + SESS_LOCK(p->p_session); + SESSRELE(p->p_session); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); + } + msgbuftrigger = 1; +} + +/* + * Ttyprintf displays a message on a tty; it should be used only by + * the tty driver, or anything that knows the underlying tty will not + * be revoke(2)'d away. Other callers should use tprintf. + */ +int +ttyprintf(struct tty *tp, const char *fmt, ...) +{ + va_list ap; + struct putchar_arg pca; + int retval; + + va_start(ap, fmt); + pca.tty = tp; + pca.flags = TOTTY; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + return (retval); +} + +/* + * Log writes to the log buffer, and guarantees not to sleep (so can be + * called by interrupt routines). If there is no process reading the + * log yet, it writes to the console also. + */ +void +log(int level, const char *fmt, ...) +{ + va_list ap; + int retval; + struct putchar_arg pca; + + pca.tty = NULL; + pca.pri = level; + pca.flags = log_open ? TOLOG : TOCONS; + + va_start(ap, fmt); + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + + msgbuftrigger = 1; +} + +#define CONSCHUNK 128 + +void +log_console(struct uio *uio) +{ + int c, i, error, iovlen, nl; + struct uio muio; + struct iovec *miov = NULL; + char *consbuffer; + int pri; + + if (!log_console_output) + return; + + pri = LOG_INFO | LOG_CONSOLE; + muio = *uio; + iovlen = uio->uio_iovcnt * sizeof (struct iovec); + MALLOC(miov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + MALLOC(consbuffer, char *, CONSCHUNK, M_TEMP, M_WAITOK); + bcopy(muio.uio_iov, miov, iovlen); + muio.uio_iov = miov; + uio = &muio; + + nl = 0; + while (uio->uio_resid > 0) { + c = imin(uio->uio_resid, CONSCHUNK); + error = uiomove(consbuffer, c, uio); + if (error != 0) + return; + for (i = 0; i < c; i++) { + msglogchar(consbuffer[i], pri); + if (consbuffer[i] == '\n') + nl = 1; + else + nl = 0; + } + } + if (!nl) + msglogchar('\n', pri); + msgbuftrigger = 1; + FREE(miov, M_TEMP); + FREE(consbuffer, M_TEMP); + return; +} + +int +printf(const char *fmt, ...) +{ + va_list ap; + int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + pca.pri = -1; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (!panicstr) + msgbuftrigger = 1; + consintr = savintr; /* reenable interrupts */ + return (retval); +} + +int +vprintf(const char *fmt, va_list ap) +{ + int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + pca.pri = -1; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + msgbuftrigger = 1; + consintr = savintr; /* reenable interrupts */ + return (retval); +} + +/* + * Print a character on console or users terminal. 
If destination is + * the console then the last bunch of characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, ap->pri); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return (retval); +} + +/* + * Scaled down version of vsprintf(3). + */ +int +vsprintf(char *buf, const char *cfmt, va_list ap) +{ + int retval; + + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + return (retval); +} + +/* + * Scaled down version of snprintf(3). + */ +int +snprintf(char *str, size_t size, const char *format, ...) +{ + int retval; + va_list ap; + + va_start(ap, format); + retval = vsnprintf(str, size, format, ap); + va_end(ap); + return(retval); +} + +/* + * Scaled down version of vsnprintf(3). + */ +int +vsnprintf(char *str, size_t size, const char *format, va_list ap) +{ + struct snprintf_arg info; + int retval; + + info.str = str; + info.remain = size; + retval = kvprintf(format, snprintf_func, &info, 10, ap); + if (info.remain >= 1) + *info.str++ = '\0'; + return (retval); +} + +static void +snprintf_func(int ch, void *arg) +{ + struct snprintf_arg *const info = arg; + + if (info->remain >= 2) { + *info->str++ = ch; + info->remain--; + } +} + +/* + * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse + * order; return an optional length and a pointer to the last character + * written in the buffer (i.e., the first character of the string). + * The buffer pointed to by `nbuf' must have length >= MAXNBUF. + */ +static char * +ksprintn(char *nbuf, uintmax_t num, int base, int *lenp) +{ + char *p; + + p = nbuf; + *p = '\0'; + do { + *++p = hex2ascii(num % base); + } while (num /= base); + if (lenp) + *lenp = p - nbuf; + return (p); +} + +/* + * Scaled down version of printf(3). + * + * Two additional formats: + * + * The format %b is supported to decode error registers. + * Its usage is: + * + * printf("reg=%b\n", regval, "<base><arg>*"); + * + * where <base> is the output base expressed as a control character, e.g. + * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, + * the first of which gives the bit number to be inspected (origin 1), and + * the next characters (up to a control character, i.e. a character <= 32), + * give the name of the register. Thus: + * + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * + * would produce output: + * + * reg=3<BITTWO,BITONE> + * + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... 
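+ *
+ * As a further worked example of %b (the register value and bit names
+ * here are made up):
+ *
+ *	printf("status=%b\n", 5, "\20\3ERROR\2BUSY\1DONE");
+ *
+ * prints "status=5<ERROR,DONE>": the value is printed in the base given
+ * by \20 (hex), then each set bit named in the string is listed, and
+ * clear bits (BUSY, bit 2) are skipped.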
+ */ +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char nbuf[MAXNBUF]; + char *d; + const char *p, *percent, *q; + u_char *up; + int ch, n; + uintmax_t num; + int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int jflag; + int dwidth; + char padc; + int retval = 0; + + num = 0; + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return (retval); + PCHAR(ch); + } + percent = fmt - 1; + qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; + jflag = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + num = va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(nbuf, num, *p++, NULL); *q;) + PCHAR(*q--); + + if (num == 0) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (num & (1 << (n - 1))) { + PCHAR(tmp ? 
',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + base = 10; + sign = 1; + goto handle_sign; + case 'j': + jflag = 1; + goto reswitch; + case 'l': + if (lflag) { + lflag = 0; + qflag = 1; + } else + lflag = 1; + goto reswitch; + case 'n': + if (jflag) + *(va_arg(ap, intmax_t *)) = retval; + else if (qflag) + *(va_arg(ap, quad_t *)) = retval; + else if (lflag) + *(va_arg(ap, long *)) = retval; + else + *(va_arg(ap, int *)) = retval; + break; + case 'o': + base = 8; + goto handle_nosign; + case 'p': + base = 16; + sharpflag = (width == 0); + sign = 0; + num = (uintptr_t)va_arg(ap, void *); + goto number; + case 'q': + qflag = 1; + goto reswitch; + case 'r': + base = radix; + if (sign) + goto handle_sign; + goto handle_nosign; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 'u': + base = 10; + goto handle_nosign; + case 'x': + case 'X': + base = 16; + goto handle_nosign; + case 'z': + base = 16; + if (sign) + goto handle_sign; +handle_nosign: + sign = 0; + if (jflag) + num = va_arg(ap, uintmax_t); + else if (qflag) + num = va_arg(ap, u_quad_t); + else if (lflag) + num = va_arg(ap, u_long); + else + num = va_arg(ap, u_int); + goto number; +handle_sign: + if (jflag) + num = va_arg(ap, intmax_t); + else if (qflag) + num = va_arg(ap, quad_t); + else if (lflag) + num = va_arg(ap, long); + else + num = va_arg(ap, int); +number: + if (sign && (intmax_t)num < 0) { + neg = 1; + num = -(intmax_t)num; + } + p = ksprintn(nbuf, num, base, &tmp); + if (sharpflag && num != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && num != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + while (percent < fmt) + PCHAR(*percent++); + break; + } + } +#undef PCHAR +} + +/* + * Put character in log buffer with a particular priority. 
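+ * Whenever the priority changes, the new priority is recorded in the
+ * buffer as a decimal "<pri>" prefix (for example "<3>" before a
+ * LOG_ERR line), so that a later reader of the message buffer can
+ * recover the priority of each line.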
+ */ +static void +msglogchar(int c, int pri) +{ + static int lastpri = -1; + static int dangling; + char nbuf[MAXNBUF]; + char *p; + + if (!msgbufmapped) + return; + if (c == '\0' || c == '\r') + return; + if (pri != -1 && pri != lastpri) { + if (dangling) { + msgaddchar('\n', NULL); + dangling = 0; + } + msgaddchar('<', NULL); + for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL); *p;) + msgaddchar(*p--, NULL); + msgaddchar('>', NULL); + lastpri = pri; + } + msgaddchar(c, NULL); + if (c == '\n') { + dangling = 0; + lastpri = -1; + } else { + dangling = 1; + } +} + +/* + * Put char in log buffer + */ +static void +msgaddchar(int c, void *dummy) +{ + struct msgbuf *mbp; + + if (!msgbufmapped) + return; + mbp = msgbufp; + mbp->msg_ptr[mbp->msg_bufx++] = c; + if (mbp->msg_bufx >= mbp->msg_size) + mbp->msg_bufx = 0; + /* If the buffer is full, keep the most recent data. */ + if (mbp->msg_bufr == mbp->msg_bufx) { + if (++mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } +} + +static void +msgbufcopy(struct msgbuf *oldp) +{ + int pos; + + pos = oldp->msg_bufr; + while (pos != oldp->msg_bufx) { + msglogchar(oldp->msg_ptr[pos], -1); + if (++pos >= oldp->msg_size) + pos = 0; + } +} + +void +msgbufinit(void *ptr, size_t size) +{ + char *cp; + static struct msgbuf *oldp = NULL; + + size -= sizeof(*msgbufp); + cp = (char *)ptr; + msgbufp = (struct msgbuf *) (cp + size); + if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_size != size || + msgbufp->msg_bufx >= size || msgbufp->msg_bufr >= size) { + bzero(cp, size); + bzero(msgbufp, sizeof(*msgbufp)); + msgbufp->msg_magic = MSG_MAGIC; + msgbufp->msg_size = (char *)msgbufp - cp; + } + msgbufp->msg_ptr = cp; + if (msgbufmapped && oldp != msgbufp) + msgbufcopy(oldp); + msgbufmapped = 1; + oldp = msgbufp; +} + +SYSCTL_DECL(_security_bsd); + +static int unprivileged_read_msgbuf = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf, + CTLFLAG_RW, &unprivileged_read_msgbuf, 0, + "Unprivileged processes may read the kernel message buffer"); + +/* Sysctls for accessing/clearing the msgbuf */ +static int +sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) +{ + int error; + + if (!unprivileged_read_msgbuf) { + error = suser(req->td); + if (error) + return (error); + } + + /* + * Unwind the buffer, so that it's linear (possibly starting with + * some initial nulls). 
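+	 * Since msg_bufx is the next write position in the circular buffer,
+	 * the oldest data lives at msg_bufx..msg_size-1 and the newest at
+	 * 0..msg_bufx-1; the two copies below therefore return the contents
+	 * in chronological order.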
+ */ + error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr + msgbufp->msg_bufx, + msgbufp->msg_size - msgbufp->msg_bufx, req); + if (error) + return (error); + if (msgbufp->msg_bufx > 0) { + error = sysctl_handle_opaque(oidp, msgbufp->msg_ptr, + msgbufp->msg_bufx, req); + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD, + 0, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); + +static int msgbuf_clear; + +static int +sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (!error && req->newptr) { + /* Clear the buffer and reset write pointer */ + bzero(msgbufp->msg_ptr, msgbufp->msg_size); + msgbufp->msg_bufr = msgbufp->msg_bufx = 0; + msgbuf_clear = 0; + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, &msgbuf_clear, 0, + sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) +{ + int i, j; + + if (!msgbufmapped) { + db_printf("msgbuf not mapped yet\n"); + return; + } + db_printf("msgbufp = %p\n", msgbufp); + db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n", + msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr, + msgbufp->msg_bufx, msgbufp->msg_ptr); + for (i = 0; i < msgbufp->msg_size; i++) { + j = (i + msgbufp->msg_bufr) % msgbufp->msg_size; + db_printf("%c", msgbufp->msg_ptr[j]); + } + db_printf("\n"); +} + +#endif /* DDB */ diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c new file mode 100644 index 0000000..706863d --- /dev/null +++ b/sys/kern/subr_prof.c @@ -0,0 +1,531 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> + +#ifdef GPROF +#include <sys/malloc.h> +#include <sys/gmon.h> +#undef MCOUNT + +static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); + +static void kmstartup(void *); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + +struct gmonparam _gmonparam = { GMON_PROF_OFF }; + +#ifdef GUPROF +#include <machine/asmacros.h> + +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + +void +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +/* + * Update the histograms to support extending the text region arbitrarily. + * This is done slightly naively (no sparse regions), so will waste slight + * amounts of memory, but will overall work nicely enough to allow profiling + * of KLDs. + */ +void +kmupetext(uintfptr_t nhighpc) +{ + struct gmonparam np; /* slightly large */ + struct gmonparam *p = &_gmonparam; + char *cp; + + GIANT_REQUIRED; + bcopy(p, &np, sizeof(*p)); + np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER)); + if (np.highpc <= p->highpc) + return; + np.textsize = np.highpc - p->lowpc; + np.kcountsize = np.textsize / HISTFRACTION; + np.hashfraction = HASHFRACTION; + np.fromssize = np.textsize / HASHFRACTION; + np.tolimit = np.textsize * ARCDENSITY / 100; + if (np.tolimit < MINARCS) + np.tolimit = MINARCS; + else if (np.tolimit > MAXARCS) + np.tolimit = MAXARCS; + np.tossize = np.tolimit * sizeof(struct tostruct); + cp = malloc(np.kcountsize + np.fromssize + np.tossize, + M_GPROF, M_WAITOK); + /* + * Check for something else extending highpc while we slept. + */ + if (np.highpc <= p->highpc) { + free(cp, M_GPROF); + return; + } + np.tos = (struct tostruct *)cp; + cp += np.tossize; + np.kcount = (HISTCOUNTER *)cp; + cp += np.kcountsize; + np.froms = (u_short *)cp; +#ifdef GUPROF + /* Reinitialize pointers to overhead counters. 
*/ + np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime)); + np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount)); + np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount)); +#endif + critical_enter(); + bcopy(p->tos, np.tos, p->tossize); + bzero((char *)np.tos + p->tossize, np.tossize - p->tossize); + bcopy(p->kcount, np.kcount, p->kcountsize); + bzero((char *)np.kcount + p->kcountsize, np.kcountsize - + p->kcountsize); + bcopy(p->froms, np.froms, p->fromssize); + bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize); + cp = (char *)p->tos; + bcopy(&np, p, sizeof(*p)); + critical_exit(); + free(cp, M_GPROF); +} + +static void +kmstartup(dummy) + void *dummy; +{ + char *cp; + struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + uintfptr_t tmp_addr; +#endif + + /* + * Round lowpc and highpc to multiples of the density we're using + * so the rest of the scaling (here and in gprof) stays in ints. + */ + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->textsize = p->highpc - p->lowpc; + printf("Profiling kernel, textsize=%lu [%x..%x]\n", + p->textsize, p->lowpc, p->highpc); + p->kcountsize = p->textsize / HISTFRACTION; + p->hashfraction = HASHFRACTION; + p->fromssize = p->textsize / HASHFRACTION; + p->tolimit = p->textsize * ARCDENSITY / 100; + if (p->tolimit < MINARCS) + p->tolimit = MINARCS; + else if (p->tolimit > MAXARCS) + p->tolimit = MAXARCS; + p->tossize = p->tolimit * sizeof(struct tostruct); + cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, + M_GPROF, M_WAITOK | M_ZERO); + p->tos = (struct tostruct *)cp; + cp += p->tossize; + p->kcount = (HISTCOUNTER *)cp; + cp += p->kcountsize; + p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + critical_enter(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("call " __XSTRING(HIDENAME(mexitcount)) "; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + __asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + critical_exit(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; + tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. + * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. 
+ */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ +} + +/* + * Return kernel profiling information. + */ +static int +sysctl_kern_prof(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *) arg1; + u_int namelen = arg2; + struct gmonparam *gp = &_gmonparam; + int error; + int state; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case GPROF_STATE: + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); + if (error) + return (error); + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; + stopprofclock(&proc0); + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; + startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); + return (0); + case GPROF_COUNT: + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); + case GPROF_FROMS: + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); + case GPROF_TOS: + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); + case GPROF_GMONPARAM: + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); +#endif /* GPROF */ + +/* + * Profiling system call. + * + * The scale factor is a fixed point number with 16 bits of fraction, so that + * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. 
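+ *
+ * For example, a scale of 0x8000 (0.5) maps each pair of text bytes to
+ * one byte of the sample buffer, so each 16-bit histogram counter then
+ * covers a four-byte range of pc values; see the PC_TO_INDEX() macro
+ * below for the exact index computation.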
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + size_t size; + size_t offset; + u_int scale; +}; +#endif +/* + * MPSAFE + */ +/* ARGSUSED */ +int +profil(td, uap) + struct thread *td; + register struct profil_args *uap; +{ + register struct uprof *upp; + int s; + int error = 0; + + mtx_lock(&Giant); + + if (uap->scale > (1 << 16)) { + error = EINVAL; + goto done2; + } + if (uap->scale == 0) { + stopprofclock(td->td_proc); + goto done2; + } + upp = &td->td_proc->p_stats->p_prof; + + /* Block profile interrupts while changing state. */ + s = splstatclock(); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; + startprofclock(td->td_proc); + splx(s); + +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Scale is a fixed-point number with the binary point 16 bits + * into the value, and is <= 1.0. pc is at most 32 bits, so the + * intermediate result is at most 48 bits. + */ +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +/* + * Collect user-level profiling statistics; called on a profiling tick, + * when a process is running in user-mode. This routine may be called + * from an interrupt context. We try to update the user profiling buffers + * cheaply with fuswintr() and suswintr(). If that fails, we revert to + * an AST that will vector us to trap() with a context in which copyin + * and copyout will work. Trap will then call addupc_task(). + * + * Note that we may (rarely) not get around to the AST soon enough, and + * lose profile ticks when the next tick overwrites this one, but in this + * case the system is overloaded and the profile is probably already + * inaccurate. + */ +void +addupc_intr(ke, pc, ticks) + register struct kse *ke; + register uintptr_t pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + register int v; + + if (ticks == 0) + return; + prof = &ke->ke_proc->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; /* out of range; ignore */ + + addr = prof->pr_base + i; + if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { + mtx_lock_spin(&sched_lock); + prof->pr_addr = pc; + prof->pr_ticks = ticks; + ke->ke_flags |= KEF_OWEUPC | KEF_ASTPENDING ; + mtx_unlock_spin(&sched_lock); + } +} + +/* + * Much like before, but we can afford to take faults here. If the + * update fails, we simply turn off profiling. 
+ */ +void +addupc_task(ke, pc, ticks) + register struct kse *ke; + register uintptr_t pc; + u_int ticks; +{ + struct proc *p = ke->ke_proc; + register struct uprof *prof; + register caddr_t addr; + register u_int i; + u_short v; + + if (ticks == 0) + return; + + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; + + addr = prof->pr_base + i; + if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) { + v += ticks; + if (copyout((caddr_t)&v, addr, sizeof(v)) == 0) + return; + } + stopprofclock(p); +} diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c new file mode 100644 index 0000000..85af088 --- /dev/null +++ b/sys/kern/subr_rman.c @@ -0,0 +1,609 @@ +/* + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The kernel resource manager. This code is responsible for keeping track + * of hardware resources which are apportioned out to various drivers. + * It does not actually assign those resources, and it is not expected + * that end-device drivers will call into this code directly. Rather, + * the code which implements the buses that those devices are attached to, + * and the code which manages CPU resources, will call this code, and the + * end-device drivers will make upcalls to that code to actually perform + * the allocation. + * + * There are two sorts of resources managed by this code. The first is + * the more familiar array (RMAN_ARRAY) type; resources in this class + * consist of a sequence of individually-allocatable objects which have + * been numbered in some well-defined order. Most of the resources + * are of this type, as it is the most familiar. The second type is + * called a gauge (RMAN_GAUGE), and models fungible resources (i.e., + * resources in which each instance is indistinguishable from every + * other instance). The principal anticipated application of gauges + * is in the context of power consumption, where a bus may have a specific + * power budget which all attached devices share. RMAN_GAUGE is not + * implemented yet. 
+ * + * For array resources, we make one simplifying assumption: two clients + * sharing the same resource must use the same range of indices. That + * is to say, sharing of overlapping-but-not-identical regions is not + * permitted. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/bus.h> /* XXX debugging */ +#include <machine/bus.h> +#include <sys/rman.h> + +#ifdef RMAN_DEBUG +#define DPRINTF(params) printf##params +#else +#define DPRINTF(params) +#endif + +static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager"); + +struct rman_head rman_head; +static struct mtx rman_mtx; /* mutex to protect rman_head */ +static int int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas); +static int int_rman_deactivate_resource(struct resource *r); +static int int_rman_release_resource(struct rman *rm, struct resource *r); + +int +rman_init(struct rman *rm) +{ + static int once; + + if (once == 0) { + once = 1; + TAILQ_INIT(&rman_head); + mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF); + } + + if (rm->rm_type == RMAN_UNINIT) + panic("rman_init"); + if (rm->rm_type == RMAN_GAUGE) + panic("implement RMAN_GAUGE"); + + TAILQ_INIT(&rm->rm_list); + rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO); + if (rm->rm_mtx == 0) + return ENOMEM; + mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF); + + mtx_lock(&rman_mtx); + TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); + mtx_unlock(&rman_mtx); + return 0; +} + +/* + * NB: this interface is not robust against programming errors which + * add multiple copies of the same region. + */ +int +rman_manage_region(struct rman *rm, u_long start, u_long end) +{ + struct resource *r, *s; + + r = malloc(sizeof *r, M_RMAN, M_NOWAIT | M_ZERO); + if (r == 0) + return ENOMEM; + r->r_start = start; + r->r_end = end; + r->r_rm = rm; + + mtx_lock(rm->rm_mtx); + for (s = TAILQ_FIRST(&rm->rm_list); + s && s->r_end < r->r_start; + s = TAILQ_NEXT(s, r_link)) + ; + + if (s == NULL) { + TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link); + } else { + TAILQ_INSERT_BEFORE(s, r, r_link); + } + + mtx_unlock(rm->rm_mtx); + return 0; +} + +int +rman_fini(struct rman *rm) +{ + struct resource *r; + + mtx_lock(rm->rm_mtx); + TAILQ_FOREACH(r, &rm->rm_list, r_link) { + if (r->r_flags & RF_ALLOCATED) { + mtx_unlock(rm->rm_mtx); + return EBUSY; + } + } + + /* + * There really should only be one of these if we are in this + * state and the code is working properly, but it can't hurt. + */ + while (!TAILQ_EMPTY(&rm->rm_list)) { + r = TAILQ_FIRST(&rm->rm_list); + TAILQ_REMOVE(&rm->rm_list, r, r_link); + free(r, M_RMAN); + } + mtx_unlock(rm->rm_mtx); + mtx_lock(&rman_mtx); + TAILQ_REMOVE(&rman_head, rm, rm_link); + mtx_unlock(&rman_mtx); + mtx_destroy(rm->rm_mtx); + free(rm->rm_mtx, M_RMAN); + + return 0; +} + +struct resource * +rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end, + u_long count, u_long bound, u_int flags, + struct device *dev) +{ + u_int want_activate; + struct resource *r, *s, *rv; + u_long rstart, rend, amask, bmask; + + rv = 0; + + DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length " + "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count, + flags, dev == NULL ? 
"<null>" : device_get_nameunit(dev))); + want_activate = (flags & RF_ACTIVE); + flags &= ~RF_ACTIVE; + + mtx_lock(rm->rm_mtx); + + for (r = TAILQ_FIRST(&rm->rm_list); + r && r->r_end < start; + r = TAILQ_NEXT(r, r_link)) + ; + + if (r == NULL) { + DPRINTF(("could not find a region\n")); + goto out; + } + + amask = (1ul << RF_ALIGNMENT(flags)) - 1; + /* If bound is 0, bmask will also be 0 */ + bmask = ~(bound - 1); + /* + * First try to find an acceptable totally-unshared region. + */ + for (s = r; s; s = TAILQ_NEXT(s, r_link)) { + DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end)); + if (s->r_start > end) { + DPRINTF(("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end)); + break; + } + if (s->r_flags & RF_ALLOCATED) { + DPRINTF(("region is allocated\n")); + continue; + } + rstart = ulmax(s->r_start, start); + /* + * Try to find a region by adjusting to boundary and alignment + * until both conditions are satisfied. This is not an optimal + * algorithm, but in most cases it isn't really bad, either. + */ + do { + rstart = (rstart + amask) & ~amask; + if (((rstart ^ (rstart + count)) & bmask) != 0) + rstart += bound - (rstart & ~bmask); + } while ((rstart & amask) != 0 && rstart < end && + rstart < s->r_end); + rend = ulmin(s->r_end, ulmax(rstart + count, end)); + DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n", + rstart, rend, (rend - rstart + 1), count)); + + if ((rend - rstart + 1) >= count) { + DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n", + rend, rstart, (rend - rstart + 1))); + if ((s->r_end - s->r_start + 1) == count) { + DPRINTF(("candidate region is entire chunk\n")); + rv = s; + rv->r_flags |= RF_ALLOCATED | flags; + rv->r_dev = dev; + goto out; + } + + /* + * If s->r_start < rstart and + * s->r_end > rstart + count - 1, then + * we need to split the region into three pieces + * (the middle one will get returned to the user). + * Otherwise, we are allocating at either the + * beginning or the end of s, so we only need to + * split it in two. The first case requires + * two new allocations; the second requires but one. + */ + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO); + if (rv == 0) + goto out; + rv->r_start = rstart; + rv->r_end = rstart + count - 1; + rv->r_flags = flags | RF_ALLOCATED; + rv->r_dev = dev; + rv->r_rm = rm; + + if (s->r_start < rv->r_start && s->r_end > rv->r_end) { + DPRINTF(("splitting region in three parts: " + "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n", + s->r_start, rv->r_start - 1, + rv->r_start, rv->r_end, + rv->r_end + 1, s->r_end)); + /* + * We are allocating in the middle. + */ + r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO); + if (r == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + r->r_start = rv->r_end + 1; + r->r_end = s->r_end; + r->r_flags = s->r_flags; + r->r_rm = rm; + s->r_end = rv->r_start - 1; + TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + TAILQ_INSERT_AFTER(&rm->rm_list, rv, r, + r_link); + } else if (s->r_start == rv->r_start) { + DPRINTF(("allocating from the beginning\n")); + /* + * We are allocating at the beginning. + */ + s->r_start = rv->r_end + 1; + TAILQ_INSERT_BEFORE(s, rv, r_link); + } else { + DPRINTF(("allocating at the end\n")); + /* + * We are allocating at the end. + */ + s->r_end = rv->r_start - 1; + TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + } + goto out; + } + } + + /* + * Now find an acceptable shared region, if the client's requirements + * allow sharing. 
By our implementation restriction, a candidate + * region must match exactly by both size and sharing type in order + * to be considered compatible with the client's request. (The + * former restriction could probably be lifted without too much + * additional work, but this does not seem warranted.) + */ + DPRINTF(("no unshared regions found\n")); + if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0) + goto out; + + for (s = r; s; s = TAILQ_NEXT(s, r_link)) { + if (s->r_start > end) + break; + if ((s->r_flags & flags) != flags) + continue; + rstart = ulmax(s->r_start, start); + rend = ulmin(s->r_end, ulmax(start + count, end)); + if (s->r_start >= start && s->r_end <= end + && (s->r_end - s->r_start + 1) == count && + (s->r_start & amask) == 0 && + ((s->r_start ^ s->r_end) & bmask) == 0) { + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO); + if (rv == 0) + goto out; + rv->r_start = s->r_start; + rv->r_end = s->r_end; + rv->r_flags = s->r_flags & + (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE); + rv->r_dev = dev; + rv->r_rm = rm; + if (s->r_sharehead == 0) { + s->r_sharehead = malloc(sizeof *s->r_sharehead, + M_RMAN, M_NOWAIT | M_ZERO); + if (s->r_sharehead == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + LIST_INIT(s->r_sharehead); + LIST_INSERT_HEAD(s->r_sharehead, s, + r_sharelink); + s->r_flags |= RF_FIRSTSHARE; + } + rv->r_sharehead = s->r_sharehead; + LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink); + goto out; + } + } + + /* + * We couldn't find anything. + */ +out: + /* + * If the user specified RF_ACTIVE in the initial flags, + * which is reflected in `want_activate', we attempt to atomically + * activate the resource. If this fails, we release the resource + * and indicate overall failure. (This behavior probably doesn't + * make sense for RF_TIMESHARE-type resources.) + */ + if (rv && want_activate) { + struct resource *whohas; + if (int_rman_activate_resource(rm, rv, &whohas)) { + int_rman_release_resource(rm, rv); + rv = 0; + } + } + + mtx_unlock(rm->rm_mtx); + return (rv); +} + +struct resource * +rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count, + u_int flags, struct device *dev) +{ + + return (rman_reserve_resource_bound(rm, start, end, count, 0, flags, + dev)); +} + +static int +int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas) +{ + struct resource *s; + int ok; + + /* + * If we are not timesharing, then there is nothing much to do. + * If we already have the resource, then there is nothing at all to do. + * If we are not on a sharing list with anybody else, then there is + * little to do. 
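+ * Otherwise, scan the sharing list and refuse to activate the
+ * resource if any other sharer currently has it active.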
+ */ + if ((r->r_flags & RF_TIMESHARE) == 0 + || (r->r_flags & RF_ACTIVE) != 0 + || r->r_sharehead == 0) { + r->r_flags |= RF_ACTIVE; + return 0; + } + + ok = 1; + for (s = LIST_FIRST(r->r_sharehead); s && ok; + s = LIST_NEXT(s, r_sharelink)) { + if ((s->r_flags & RF_ACTIVE) != 0) { + ok = 0; + *whohas = s; + } + } + if (ok) { + r->r_flags |= RF_ACTIVE; + return 0; + } + return EBUSY; +} + +int +rman_activate_resource(struct resource *r) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + rv = int_rman_activate_resource(rm, r, &whohas); + mtx_unlock(rm->rm_mtx); + return rv; +} + +int +rman_await_resource(struct resource *r, int pri, int timo) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + for (;;) { + rv = int_rman_activate_resource(rm, r, &whohas); + if (rv != EBUSY) + return (rv); /* returns with mutex held */ + + if (r->r_sharehead == 0) + panic("rman_await_resource"); + whohas->r_flags |= RF_WANTED; + rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo); + if (rv) { + mtx_unlock(rm->rm_mtx); + return (rv); + } + } +} + +static int +int_rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + r->r_flags &= ~RF_ACTIVE; + if (r->r_flags & RF_WANTED) { + r->r_flags &= ~RF_WANTED; + wakeup(r->r_sharehead); + } + return 0; +} + +int +rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + mtx_lock(rm->rm_mtx); + int_rman_deactivate_resource(r); + mtx_unlock(rm->rm_mtx); + return 0; +} + +static int +int_rman_release_resource(struct rman *rm, struct resource *r) +{ + struct resource *s, *t; + + if (r->r_flags & RF_ACTIVE) + int_rman_deactivate_resource(r); + + /* + * Check for a sharing list first. If there is one, then we don't + * have to think as hard. + */ + if (r->r_sharehead) { + /* + * If a sharing list exists, then we know there are at + * least two sharers. + * + * If we are in the main circleq, appoint someone else. + */ + LIST_REMOVE(r, r_sharelink); + s = LIST_FIRST(r->r_sharehead); + if (r->r_flags & RF_FIRSTSHARE) { + s->r_flags |= RF_FIRSTSHARE; + TAILQ_INSERT_BEFORE(r, s, r_link); + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } + + /* + * Make sure that the sharing list goes away completely + * if the resource is no longer being shared at all. + */ + if (LIST_NEXT(s, r_sharelink) == 0) { + free(s->r_sharehead, M_RMAN); + s->r_sharehead = 0; + s->r_flags &= ~RF_FIRSTSHARE; + } + goto out; + } + + /* + * Look at the adjacent resources in the list and see if our + * segment can be merged with any of them. + */ + s = TAILQ_PREV(r, resource_head, r_link); + t = TAILQ_NEXT(r, r_link); + + if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0 + && t != NULL && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge all three segments. + */ + s->r_end = t->r_end; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + TAILQ_REMOVE(&rm->rm_list, t, r_link); + free(t, M_RMAN); + } else if (s != NULL && (s->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge previous segment with ours. + */ + s->r_end = r->r_end; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } else if (t != NULL && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge next segment with ours. + */ + t->r_start = r->r_start; + TAILQ_REMOVE(&rm->rm_list, r, r_link); + } else { + /* + * At this point, we know there is nothing we + * can potentially merge with, because on each + * side, there is either nothing there or what is + * there is still allocated. 
In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *r)
+{
+ int rv;
+ struct rman *rm = r->r_rm;
+
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_release_resource(rm, r);
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+uint32_t
+rman_make_alignment_flags(uint32_t size)
+{
+ int i;
+
+ /*
+ * Find the highest bit set, and add one if more than one bit
+ * is set. We're effectively computing the ceil(log2(size)) here.
+ */
+ for (i = 31; i > 0; i--)
+ if ((1 << i) & size)
+ break;
+ if (~(1 << i) & size)
+ i++;
+
+ return(RF_ALIGNMENT_LOG2(i));
+}
diff --git a/sys/kern/subr_rtc.c b/sys/kern/subr_rtc.c
new file mode 100644
index 0000000..a79e331
--- /dev/null
+++ b/sys/kern/subr_rtc.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures. + */ + +/* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/clock.h> +#include <sys/sysctl.h> +#include <sys/timetc.h> + +#include "clock_if.h" + +static __inline int leapyear(int year); +static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS); + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +static device_t clock_dev = NULL; +static long clock_res; + +int adjkerntz; /* local offset from GMT in seconds */ +int disable_rtc_set; /* disable resettodr() if != 0 */ +int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ + +/* + * These have traditionally been in machdep, but should probably be moved to + * kern. + */ +SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, + &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); + +SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, + CTLFLAG_RW, &disable_rtc_set, 0, ""); + +SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, + CTLFLAG_RW, &wall_cmos_clock, 0, ""); + +static int +sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, + req); + if (!error && req->newptr) + resettodr(); + return (error); +} + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static __inline int +leapyear(int year) +{ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } + } + return (rv); +} + +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) +{ + time_t secs; + int i, year, days; + + year = ct->year; + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + ct->year > 2037) /* time_t overflow */ + return (EINVAL); + + /* + * Compute days since start of time + * First from years, then from months. + */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + /* Another sanity check. */ + if (ct->dow != -1 && ct->dow != day_of_week(days)) + return (EINVAL); + + /* Add hours, minutes, seconds. */ + secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec; + + ts->tv_sec = secs; + ts->tv_nsec = ct->nsec; + return (0); +} + +void +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. 
*/ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +} + +void +clock_register(device_t dev, long res) +{ + + if (clock_dev != NULL) { + if (clock_res > res) { + if (bootverbose) { + device_printf(dev, "not installed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(clock_dev)); + } + return; + } else { + if (bootverbose) { + device_printf(clock_dev, "removed as " + "time-of-day clock: clock %s has higher " + "resolution\n", device_get_name(dev)); + } + } + } + clock_dev = dev; + clock_res = res; + if (bootverbose) { + device_printf(dev, "registered as a time-of-day clock " + "(resolution %ldus)\n", res); + } +} + +/* + * inittodr and settodr derived from the i386 versions written + * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and + * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94 + */ + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + struct timespec diff, ref, ts; + int error; + + if (base) { + ref.tv_sec = base; + ref.tv_nsec = 0; + tc_setclock(&ref); + } + + if (clock_dev == NULL) { + printf("warning: no time-of-day clock registered, system time " + "will not be set accurately\n"); + return; + } + error = CLOCK_GETTIME(clock_dev, &ts); + if (error != 0 && error != EINVAL) { + printf("warning: clock_gettime failed (%d), the system time " + "will not be set accurately\n", error); + return; + } + if (error == EINVAL || ts.tv_sec < 0) { + printf("Invalid time in real time clock.\n"); + printf("Check and reset the date immediately!\n"); + } + + ts.tv_sec += tz.tz_minuteswest * 60 + + (wall_cmos_clock ? adjkerntz : 0); + + if (timespeccmp(&ref, &ts, >)) { + diff = ref; + timespecsub(&ref, &ts); + } else { + diff = ts; + timespecsub(&diff, &ref); + } + if (ts.tv_sec >= 2) { + /* badly off, adjust it */ + tc_setclock(&ts); + } +} + +/* + * Write system time back to RTC + */ +void +resettodr() +{ + struct timespec ts; + int error; + + if (disable_rtc_set || clock_dev == NULL) + return; + + getnanotime(&ts); + ts.tv_sec -= tz.tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); + if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) { + printf("warning: clock_settime failed (%d), time-of-day clock " + "not adjusted to system time\n", error); + return; + } +} diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c new file mode 100644 index 0000000..6c910e6 --- /dev/null +++ b/sys/kern/subr_sbuf.c @@ -0,0 +1,560 @@ +/*- + * Copyright (c) 2000 Poul-Henning Kamp and Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> + +#ifdef _KERNEL +#include <sys/ctype.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <machine/stdarg.h> +#else /* _KERNEL */ +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif /* _KERNEL */ + +#include <sys/sbuf.h> + +#ifdef _KERNEL +MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); +#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK) +#define SBFREE(buf) free(buf, M_SBUF) +#else /* _KERNEL */ +#define KASSERT(e, m) +#define SBMALLOC(size) malloc(size) +#define SBFREE(buf) free(buf) +#define min(x,y) MIN(x,y) +#endif /* _KERNEL */ + +/* + * Predicates + */ +#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) +#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) +#define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) +#define SBUF_HASOVERFLOWED(s) ((s)->s_flags & SBUF_OVERFLOWED) +#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) +#define SBUF_FREESPACE(s) ((s)->s_size - (s)->s_len - 1) +#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) + +/* + * Set / clear flags + */ +#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0) +#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0) + +#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */ +#define SBUF_MAXEXTENDSIZE PAGE_SIZE +#define SBUF_MAXEXTENDINCR PAGE_SIZE + +/* + * Debugging support + */ +#if defined(_KERNEL) && defined(INVARIANTS) +static void +_assert_sbuf_integrity(const char *fun, struct sbuf *s) +{ + KASSERT(s != NULL, + ("%s called with a NULL sbuf pointer", fun)); + KASSERT(s->s_buf != NULL, + ("%s called with uninitialized or corrupt sbuf", fun)); + KASSERT(s->s_len < s->s_size, + ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); +} + +static void +_assert_sbuf_state(const char *fun, struct sbuf *s, int state) +{ + KASSERT((s->s_flags & SBUF_FINISHED) == state, + ("%s called with %sfinished or corrupt sbuf", fun, + (state ? "un" : ""))); +} +#define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s)) +#define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i)) +#else /* _KERNEL && INVARIANTS */ +#define assert_sbuf_integrity(s) do { } while (0) +#define assert_sbuf_state(s, i) do { } while (0) +#endif /* _KERNEL && INVARIANTS */ + +static int +sbuf_extendsize(int size) +{ + int newsize; + + newsize = SBUF_MINEXTENDSIZE; + while (newsize < size) { + if (newsize < SBUF_MAXEXTENDSIZE) + newsize *= 2; + else + newsize += SBUF_MAXEXTENDINCR; + } + + return (newsize); +} + + +/* + * Extend an sbuf. 
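+ * The buffer grows in powers of two up to SBUF_MAXEXTENDSIZE and in
+ * steps of SBUF_MAXEXTENDINCR beyond that (see sbuf_extendsize() above).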
+ */ +static int +sbuf_extend(struct sbuf *s, int addlen) +{ + char *newbuf; + int newsize; + + if (!SBUF_CANEXTEND(s)) + return (-1); + + newsize = sbuf_extendsize(s->s_size + addlen); + newbuf = (char *)SBMALLOC(newsize); + if (newbuf == NULL) + return (-1); + bcopy(s->s_buf, newbuf, s->s_size); + if (SBUF_ISDYNAMIC(s)) + SBFREE(s->s_buf); + else + SBUF_SETFLAG(s, SBUF_DYNAMIC); + s->s_buf = newbuf; + s->s_size = newsize; + return (0); +} + +/* + * Initialize an sbuf. + * If buf is non-NULL, it points to a static or already-allocated string + * big enough to hold at least length characters. + */ +struct sbuf * +sbuf_new(struct sbuf *s, char *buf, int length, int flags) +{ + KASSERT(length >= 0, + ("attempt to create an sbuf of negative length (%d)", length)); + KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, + ("%s called with invalid flags", __func__)); + + flags &= SBUF_USRFLAGMSK; + if (s == NULL) { + s = (struct sbuf *)SBMALLOC(sizeof *s); + if (s == NULL) + return (NULL); + bzero(s, sizeof *s); + s->s_flags = flags; + SBUF_SETFLAG(s, SBUF_DYNSTRUCT); + } else { + bzero(s, sizeof *s); + s->s_flags = flags; + } + s->s_size = length; + if (buf) { + s->s_buf = buf; + return (s); + } + if (flags & SBUF_AUTOEXTEND) + s->s_size = sbuf_extendsize(s->s_size); + s->s_buf = (char *)SBMALLOC(s->s_size); + if (s->s_buf == NULL) { + if (SBUF_ISDYNSTRUCT(s)) + SBFREE(s); + return (NULL); + } + SBUF_SETFLAG(s, SBUF_DYNAMIC); + return (s); +} + +#ifdef _KERNEL +/* + * Create an sbuf with uio data + */ +struct sbuf * +sbuf_uionew(struct sbuf *s, struct uio *uio, int *error) +{ + KASSERT(uio != NULL, + ("%s called with NULL uio pointer", __func__)); + KASSERT(error != NULL, + ("%s called with NULL error pointer", __func__)); + + s = sbuf_new(s, NULL, uio->uio_resid + 1, 0); + if (s == NULL) { + *error = ENOMEM; + return (NULL); + } + *error = uiomove(s->s_buf, uio->uio_resid, uio); + if (*error != 0) { + sbuf_delete(s); + return (NULL); + } + s->s_len = s->s_size - 1; + *error = 0; + return (s); +} +#endif + +/* + * Clear an sbuf and reset its position. + */ +void +sbuf_clear(struct sbuf *s) +{ + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + SBUF_CLEARFLAG(s, SBUF_FINISHED); + SBUF_CLEARFLAG(s, SBUF_OVERFLOWED); + s->s_len = 0; +} + +/* + * Set the sbuf's end position to an arbitrary value. + * Effectively truncates the sbuf at the new position. + */ +int +sbuf_setpos(struct sbuf *s, int pos) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + KASSERT(pos >= 0, + ("attempt to seek to a negative position (%d)", pos)); + KASSERT(pos < s->s_size, + ("attempt to seek past end of sbuf (%d >= %d)", pos, s->s_size)); + + if (pos < 0 || pos > s->s_len) + return (-1); + s->s_len = pos; + return (0); +} + +/* + * Append a byte string to an sbuf. + */ +int +sbuf_bcat(struct sbuf *s, const char *str, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + for (; len; len--) { + if (!SBUF_HASROOM(s) && sbuf_extend(s, len) < 0) + break; + s->s_buf[s->s_len++] = *str++; + } + if (len) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + return (0); +} + +#ifdef _KERNEL +/* + * Copy a byte string from userland into an sbuf. 
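+ * If the buffer cannot be extended far enough, the copy is silently
+ * truncated to the space that is available.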
+ */ +int +sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (len == 0) + return (0); + if (len > SBUF_FREESPACE(s)) { + sbuf_extend(s, len - SBUF_FREESPACE(s)); + len = min(len, SBUF_FREESPACE(s)); + } + if (copyin(uaddr, s->s_buf + s->s_len, len) != 0) + return (-1); + s->s_len += len; + + return (0); +} +#endif + +/* + * Copy a byte string into an sbuf. + */ +int +sbuf_bcpy(struct sbuf *s, const char *str, size_t len) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + sbuf_clear(s); + return (sbuf_bcat(s, str, len)); +} + +/* + * Append a string to an sbuf. + */ +int +sbuf_cat(struct sbuf *s, const char *str) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + while (*str) { + if (!SBUF_HASROOM(s) && sbuf_extend(s, strlen(str)) < 0) + break; + s->s_buf[s->s_len++] = *str++; + } + if (*str) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + return (0); +} + +#ifdef _KERNEL +/* + * Append a string from userland to an sbuf. + */ +int +sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len) +{ + size_t done; + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (len == 0) + len = SBUF_FREESPACE(s); /* XXX return 0? */ + if (len > SBUF_FREESPACE(s)) { + sbuf_extend(s, len); + len = min(len, SBUF_FREESPACE(s)); + } + switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) { + case ENAMETOOLONG: + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + /* fall through */ + case 0: + s->s_len += done - 1; + break; + default: + return (-1); /* XXX */ + } + + return (0); +} +#endif + +/* + * Copy a string into an sbuf. + */ +int +sbuf_cpy(struct sbuf *s, const char *str) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + sbuf_clear(s); + return (sbuf_cat(s, str)); +} + +/* + * Format the given argument list and append the resulting string to an sbuf. + */ +int +sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) +{ + int len; + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + KASSERT(fmt != NULL, + ("%s called with a NULL format string", __func__)); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + do { + len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, + fmt, ap); + } while (len > SBUF_FREESPACE(s) && + sbuf_extend(s, len - SBUF_FREESPACE(s)) == 0); + + /* + * s->s_len is the length of the string, without the terminating nul. + * When updating s->s_len, we must subtract 1 from the length that + * we passed into vsnprintf() because that length includes the + * terminating nul. + * + * vsnprintf() returns the amount that would have been copied, + * given sufficient space, hence the min() calculation below. + */ + s->s_len += min(len, SBUF_FREESPACE(s)); + if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s)) + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + + KASSERT(s->s_len < s->s_size, + ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + return (0); +} + +/* + * Format the given arguments and append the resulting string to an sbuf. + */ +int +sbuf_printf(struct sbuf *s, const char *fmt, ...) +{ + va_list ap; + int result; + + va_start(ap, fmt); + result = sbuf_vprintf(s, fmt, ap); + va_end(ap); + return(result); +} + +/* + * Append a character to an sbuf. 
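+ * A NUL character is accepted but not stored.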
+ */ +int +sbuf_putc(struct sbuf *s, int c) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + if (!SBUF_HASROOM(s) && sbuf_extend(s, 1) < 0) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + return (-1); + } + if (c != '\0') + s->s_buf[s->s_len++] = c; + return (0); +} + +/* + * Trim whitespace characters from end of an sbuf. + */ +int +sbuf_trim(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + + while (s->s_len && isspace(s->s_buf[s->s_len-1])) + --s->s_len; + + return (0); +} + +/* + * Check if an sbuf overflowed + */ +int +sbuf_overflowed(struct sbuf *s) +{ + return SBUF_HASOVERFLOWED(s); +} + +/* + * Finish off an sbuf. + */ +void +sbuf_finish(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + s->s_buf[s->s_len] = '\0'; + SBUF_CLEARFLAG(s, SBUF_OVERFLOWED); + SBUF_SETFLAG(s, SBUF_FINISHED); +} + +/* + * Return a pointer to the sbuf data. + */ +char * +sbuf_data(struct sbuf *s) +{ + assert_sbuf_integrity(s); + assert_sbuf_state(s, SBUF_FINISHED); + + return s->s_buf; +} + +/* + * Return the length of the sbuf data. + */ +int +sbuf_len(struct sbuf *s) +{ + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + if (SBUF_HASOVERFLOWED(s)) + return (-1); + return s->s_len; +} + +/* + * Clear an sbuf, free its buffer if necessary. + */ +void +sbuf_delete(struct sbuf *s) +{ + int isdyn; + + assert_sbuf_integrity(s); + /* don't care if it's finished or not */ + + if (SBUF_ISDYNAMIC(s)) + SBFREE(s->s_buf); + isdyn = SBUF_ISDYNSTRUCT(s); + bzero(s, sizeof *s); + if (isdyn) + SBFREE(s); +} diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c new file mode 100644 index 0000000..13f02b8 --- /dev/null +++ b/sys/kern/subr_scanf.c @@ -0,0 +1,628 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp + * From: static char sccsid[] = "@(#)strtol.c 8.1 (Berkeley) 6/4/93"; + * From: static char sccsid[] = "@(#)strtoul.c 8.1 (Berkeley) 6/4/93"; + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ctype.h> +#include <machine/limits.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define BUF 32 /* Maximum length of numeric string. */ + +/* + * Flags used during conversion. + */ +#define LONG 0x01 /* l: long or double */ +#define SHORT 0x04 /* h: short */ +#define SUPPRESS 0x08 /* suppress assignment */ +#define POINTER 0x10 /* weird %p pointer (`fake hex') */ +#define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 + +/* + * The following are used in numeric conversions only: + * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point; + * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral. + */ +#define SIGNOK 0x40 /* +/- is (still) legal */ +#define NDIGITS 0x80 /* no digits detected */ + +#define DPTOK 0x100 /* (float) decimal point is still legal */ +#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */ + +#define PFXOK 0x100 /* 0x prefix is (still) legal */ +#define NZDIGITS 0x200 /* no zero digits detected */ + +/* + * Conversion types. + */ +#define CT_CHAR 0 /* %c conversion */ +#define CT_CCL 1 /* %[...] conversion */ +#define CT_STRING 2 /* %s conversion */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ +typedef u_quad_t (*ccfntype)(const char *, char **, int); + +static const u_char *__sccl(char *, const u_char *); + +int +sscanf(const char *ibuf, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vsscanf(ibuf, fmt, ap); + va_end(ap); + return(ret); +} + +int +vsscanf(const char *inp, char const *fmt0, va_list ap) +{ + int inr; + const u_char *fmt = (const u_char *)fmt0; + int c; /* character from format, or conversion */ + size_t width; /* field width, or 0 */ + char *p; /* points into all kinds of strings */ + int n; /* handy integer */ + int flags; /* flags as defined above */ + char *p0; /* saves original value of p when necessary */ + int nassigned; /* number of fields assigned */ + int nconversions; /* number of conversions */ + int nread; /* number of characters consumed from fp */ + int base; /* base argument to strtoq/strtouq */ + ccfntype ccfn; /* conversion function (strtoq/strtouq) */ + char ccltab[256]; /* character class table for %[...] 
*/ + char buf[BUF]; /* buffer for numeric conversions */ + + /* `basefix' is used to avoid `if' tests in the integer scanner */ + static short basefix[17] = + { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + inr = strlen(inp); + + nassigned = 0; + nconversions = 0; + nread = 0; + base = 0; /* XXX just to keep gcc happy */ + ccfn = NULL; /* XXX just to keep gcc happy */ + for (;;) { + c = *fmt++; + if (c == 0) + return (nassigned); + if (isspace(c)) { + while (inr > 0 && isspace(*inp)) + nread++, inr--, inp++; + continue; + } + if (c != '%') + goto literal; + width = 0; + flags = 0; + /* + * switch on the format. continue if done; + * break once format type is derived. + */ +again: c = *fmt++; + switch (c) { + case '%': +literal: + if (inr <= 0) + goto input_failure; + if (*inp != c) + goto match_failure; + inr--, inp++; + nread++; + continue; + + case '*': + flags |= SUPPRESS; + goto again; + case 'l': + flags |= LONG; + goto again; + case 'q': + flags |= QUAD; + goto again; + case 'h': + flags |= SHORT; + goto again; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + width = width * 10 + c - '0'; + goto again; + + /* + * Conversions. + * + */ + case 'd': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 10; + break; + + case 'i': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 0; + break; + + case 'o': + c = CT_INT; + ccfn = strtouq; + base = 8; + break; + + case 'u': + c = CT_INT; + ccfn = strtouq; + base = 10; + break; + + case 'x': + flags |= PFXOK; /* enable 0x prefixing */ + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 's': + c = CT_STRING; + break; + + case '[': + fmt = __sccl(ccltab, fmt); + flags |= NOSKIP; + c = CT_CCL; + break; + + case 'c': + flags |= NOSKIP; + c = CT_CHAR; + break; + + case 'p': /* pointer format is like hex */ + flags |= POINTER | PFXOK; + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 'n': + nconversions++; + if (flags & SUPPRESS) /* ??? */ + continue; + if (flags & SHORT) + *va_arg(ap, short *) = nread; + else if (flags & LONG) + *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; + else + *va_arg(ap, int *) = nread; + continue; + } + + /* + * We have a conversion that requires input. + */ + if (inr <= 0) + goto input_failure; + + /* + * Consume leading white space, except for formats + * that suppress this. + */ + if ((flags & NOSKIP) == 0) { + while (isspace(*inp)) { + nread++; + if (--inr > 0) + inp++; + else + goto input_failure; + } + /* + * Note that there is at least one character in + * the buffer, so conversions that do not set NOSKIP + * can no longer result in an input failure. + */ + } + + /* + * Do the conversion. 
+ */ + switch (c) { + + case CT_CHAR: + /* scan arbitrary characters (sets NOSKIP) */ + if (width == 0) + width = 1; + if (flags & SUPPRESS) { + size_t sum = 0; + for (;;) { + if ((n = inr) < width) { + sum += n; + width -= n; + inp += n; + if (sum == 0) + goto input_failure; + break; + } else { + sum += width; + inr -= width; + inp += width; + break; + } + } + nread += sum; + } else { + bcopy(inp, va_arg(ap, char *), width); + inr -= width; + inp += width; + nread += width; + nassigned++; + } + nconversions++; + break; + + case CT_CCL: + /* scan a (nonempty) character class (sets NOSKIP) */ + if (width == 0) + width = (size_t)~0; /* `infinity' */ + /* take only those things in the class */ + if (flags & SUPPRESS) { + n = 0; + while (ccltab[(unsigned char)*inp]) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (n == 0) + goto input_failure; + break; + } + } + if (n == 0) + goto match_failure; + } else { + p0 = p = va_arg(ap, char *); + while (ccltab[(unsigned char)*inp]) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (p == p0) + goto input_failure; + break; + } + } + n = p - p0; + if (n == 0) + goto match_failure; + *p = 0; + nassigned++; + } + nread += n; + nconversions++; + break; + + case CT_STRING: + /* like CCL, but zero-length string OK, & no NOSKIP */ + if (width == 0) + width = (size_t)~0; + if (flags & SUPPRESS) { + n = 0; + while (!isspace(*inp)) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + nread += n; + } else { + p0 = p = va_arg(ap, char *); + while (!isspace(*inp)) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + *p = 0; + nread += p - p0; + nassigned++; + } + nconversions++; + continue; + + case CT_INT: + /* scan an integer as if by strtoq/strtouq */ +#ifdef hardway + if (width == 0 || width > sizeof(buf) - 1) + width = sizeof(buf) - 1; +#else + /* size_t is unsigned, hence this optimisation */ + if (--width > sizeof(buf) - 2) + width = sizeof(buf) - 2; + width++; +#endif + flags |= SIGNOK | NDIGITS | NZDIGITS; + for (p = buf; width; width--) { + c = *inp; + /* + * Switch on the character; `goto ok' + * if we accept it as a part of number. + */ + switch (c) { + + /* + * The digit 0 is always legal, but is + * special. For %i conversions, if no + * digits (zero or nonzero) have been + * scanned (only signs), we will have + * base==0. In that case, we should set + * it to 8 and enable 0x prefixing. + * Also, if we have not scanned zero digits + * before this, do not turn off prefixing + * (someone else will turn it off if we + * have scanned any nonzero digits). 
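+ * For example, with %i the input `0x1f' selects base 8
+ * when the leading `0' is seen and then switches to
+ * base 16 when the `x' is accepted.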
+ */ + case '0': + if (base == 0) { + base = 8; + flags |= PFXOK; + } + if (flags & NZDIGITS) + flags &= ~(SIGNOK|NZDIGITS|NDIGITS); + else + flags &= ~(SIGNOK|PFXOK|NDIGITS); + goto ok; + + /* 1 through 7 always legal */ + case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + base = basefix[base]; + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* digits 8 and 9 ok iff decimal or hex */ + case '8': case '9': + base = basefix[base]; + if (base <= 8) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* letters ok iff hex */ + case 'A': case 'B': case 'C': + case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': + case 'd': case 'e': case 'f': + /* no need to fix base here */ + if (base <= 10) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* sign ok only as first character */ + case '+': case '-': + if (flags & SIGNOK) { + flags &= ~SIGNOK; + goto ok; + } + break; + + /* x ok iff flag still set & 2nd char */ + case 'x': case 'X': + if (flags & PFXOK && p == buf + 1) { + base = 16; /* if %i */ + flags &= ~PFXOK; + goto ok; + } + break; + } + + /* + * If we got here, c is not a legal character + * for a number. Stop accumulating digits. + */ + break; + ok: + /* + * c is legal: store it and look at the next. + */ + *p++ = c; + if (--inr > 0) + inp++; + else + break; /* end of input */ + } + /* + * If we had only a sign, it is no good; push + * back the sign. If the number ends in `x', + * it was [sign] '0' 'x', so push back the x + * and treat it as [sign] '0'. + */ + if (flags & NDIGITS) { + if (p > buf) { + inp--; + inr++; + } + goto match_failure; + } + c = ((u_char *)p)[-1]; + if (c == 'x' || c == 'X') { + --p; + inp--; + inr++; + } + if ((flags & SUPPRESS) == 0) { + u_quad_t res; + + *p = 0; + res = (*ccfn)(buf, (char **)NULL, base); + if (flags & POINTER) + *va_arg(ap, void **) = + (void *)(uintptr_t)res; + else if (flags & SHORT) + *va_arg(ap, short *) = res; + else if (flags & LONG) + *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; + else + *va_arg(ap, int *) = res; + nassigned++; + } + nread += p - buf; + nconversions++; + break; + + } + } +input_failure: + return (nconversions != 0 ? nassigned : -1); +match_failure: + return (nassigned); +} + +/* + * Fill in the given table from the scanset at the given format + * (just after `['). Return a pointer to the character past the + * closing `]'. The table has a 1 wherever characters should be + * considered part of the scanset. + */ +static const u_char * +__sccl(char *tab, const u_char *fmt) +{ + int c, n, v; + + /* first `clear' the whole table */ + c = *fmt++; /* first char hat => negated scanset */ + if (c == '^') { + v = 1; /* default => accept */ + c = *fmt++; /* get new first char */ + } else + v = 0; /* default => reject */ + + /* XXX: Will not work if sizeof(tab*) > sizeof(char) */ + for (n = 0; n < 256; n++) + tab[n] = v; /* memset(tab, v, 256) */ + + if (c == 0) + return (fmt - 1);/* format ended before closing ] */ + + /* + * Now set the entries corresponding to the actual scanset + * to the opposite of the above. + * + * The first character may be ']' (or '-') without being special; + * the last character may be '-'. 
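+ * For example, the scanset `[^]0-9-]' matches any character except
+ * `]', the digits `0' through `9' and `-'.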
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+ * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
+
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..9dad93b
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2001
+ * John Baldwin <jhb@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY JOHN BALDWIN AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL JOHN BALDWIN OR THE VOICES IN HIS HEAD
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This module holds the global variables and machine independent functions
+ * used for the kernel SMP support.
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <machine/smp.h> + +volatile u_int stopped_cpus; +volatile u_int started_cpus; + +void (*cpustop_restartfunc)(void); +int mp_ncpus; + +volatile int smp_started; +u_int all_cpus; +u_int mp_maxid; + +SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD, NULL, "Kernel SMP"); + +int smp_active = 0; /* are the APs allowed to run? */ +SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0, ""); + +int smp_cpus = 1; /* how many cpu's running */ +SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD, &smp_cpus, 0, ""); + +/* Enable forwarding of a signal to a process running on a different CPU */ +static int forward_signal_enabled = 1; +SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, + &forward_signal_enabled, 0, ""); + +/* Enable forwarding of roundrobin to all other cpus */ +static int forward_roundrobin_enabled = 1; +SYSCTL_INT(_kern_smp, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, + &forward_roundrobin_enabled, 0, ""); + +/* Variables needed for SMP rendezvous. */ +static void (*smp_rv_setup_func)(void *arg); +static void (*smp_rv_action_func)(void *arg); +static void (*smp_rv_teardown_func)(void *arg); +static void *smp_rv_func_arg; +static volatile int smp_rv_waiters[2]; +static struct mtx smp_rv_mtx; +static int mp_probe_status; + +/* + * Initialize MI SMP variables. + */ +static void +mp_probe(void *dummy) +{ + mp_probe_status = cpu_mp_probe(); +} +SYSINIT(cpu_mp_probe, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_probe, NULL) + +/* + * Call the MD SMP initialization code. + */ +static void +mp_start(void *dummy) +{ + + /* Probe for MP hardware. */ + if (mp_probe_status == 0) + return; + + mtx_init(&smp_rv_mtx, "smp rendezvous", NULL, MTX_SPIN); + cpu_mp_start(); + printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", + mp_ncpus); + cpu_mp_announce(); +} +SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_SECOND, mp_start, NULL) + +void +forward_signal(struct thread *td) +{ + int id; + + /* + * signotify() has already set KEF_ASTPENDING and PS_NEEDSIGCHECK on + * this process, so all we need to do is poke it if it is currently + * executing so that it executes ast(). + */ + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(td->td_proc->p_stat == SRUN, + ("forward_signal: process is not SRUN")); + + CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); + + if (!smp_started || cold || panicstr) + return; + if (!forward_signal_enabled) + return; + + /* No need to IPI ourself. */ + if (td == curthread) + return; + + id = td->td_kse->ke_oncpu; + if (id == NOCPU) + return; + ipi_selected(1 << id, IPI_AST); +} + +void +forward_roundrobin(void) +{ + struct pcpu *pc; + struct thread *td; + u_int id, map; + + mtx_assert(&sched_lock, MA_OWNED); + + CTR0(KTR_SMP, "forward_roundrobin()"); + + if (!smp_started || cold || panicstr) + return; + if (!forward_roundrobin_enabled) + return; + map = 0; + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + td = pc->pc_curthread; + id = pc->pc_cpumask; + if (id != PCPU_GET(cpumask) && (id & stopped_cpus) == 0 && + td != pc->pc_idlethread) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + map |= id; + } + } + ipi_selected(map, IPI_AST); +} + +/* + * When called the executing CPU will send an IPI to all other CPUs + * requesting that they halt execution. 
+ * + * Usually (but not necessarily) called with 'other_cpus' as its arg. + * + * - Signals all CPUs in map to stop. + * - Waits for each to stop. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + * + * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs + * from executing at same time. + */ +int +stop_cpus(u_int map) +{ + int i; + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "stop_cpus(%x)", map); + + /* send the stop IPI to all CPUs in map */ + ipi_selected(map, IPI_STOP); + + i = 0; + while ((atomic_load_acq_int(&stopped_cpus) & map) != map) { + /* spin */ + i++; +#ifdef DIAGNOSTIC + if (i == 100000) { + printf("timeout stopping cpus\n"); + break; + } +#endif + } + + return 1; +} + + +/* + * Called by a CPU to restart stopped CPUs. + * + * Usually (but not necessarily) called with 'stopped_cpus' as its arg. + * + * - Signals all CPUs in map to restart. + * - Waits for each to restart. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + */ +int +restart_cpus(u_int map) +{ + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "restart_cpus(%x)", map); + + /* signal other cpus to restart */ + atomic_store_rel_int(&started_cpus, map); + + /* wait for each to clear its bit */ + while ((atomic_load_acq_int(&stopped_cpus) & map) != 0) + ; /* nothing */ + + return 1; +} + +/* + * All-CPU rendezvous. CPUs are signalled, all execute the setup function + * (if specified), rendezvous, execute the action function (if specified), + * rendezvous again, execute the teardown function (if specified), and then + * resume. + * + * Note that the supplied external functions _must_ be reentrant and aware + * that they are running in parallel and in an unknown lock context. + */ +void +smp_rendezvous_action(void) +{ + + /* setup function */ + if (smp_rv_setup_func != NULL) + smp_rv_setup_func(smp_rv_func_arg); + /* spin on entry rendezvous */ + atomic_add_int(&smp_rv_waiters[0], 1); + while (atomic_load_acq_int(&smp_rv_waiters[0]) < mp_ncpus) + ; /* nothing */ + /* action function */ + if (smp_rv_action_func != NULL) + smp_rv_action_func(smp_rv_func_arg); + /* spin on exit rendezvous */ + atomic_add_int(&smp_rv_waiters[1], 1); + while (atomic_load_acq_int(&smp_rv_waiters[1]) < mp_ncpus) + ; /* nothing */ + /* teardown function */ + if (smp_rv_teardown_func != NULL) + smp_rv_teardown_func(smp_rv_func_arg); +} + +void +smp_rendezvous(void (* setup_func)(void *), + void (* action_func)(void *), + void (* teardown_func)(void *), + void *arg) +{ + + if (!smp_started) { + if (setup_func != NULL) + setup_func(arg); + if (action_func != NULL) + action_func(arg); + if (teardown_func != NULL) + teardown_func(arg); + return; + } + + /* obtain rendezvous lock */ + mtx_lock_spin(&smp_rv_mtx); + + /* set static function pointers */ + smp_rv_setup_func = setup_func; + smp_rv_action_func = action_func; + smp_rv_teardown_func = teardown_func; + smp_rv_func_arg = arg; + smp_rv_waiters[0] = 0; + smp_rv_waiters[1] = 0; + + /* signal other processors, which will enter the IPI with interrupts off */ + ipi_all_but_self(IPI_RENDEZVOUS); + + /* call executor function */ + smp_rendezvous_action(); + + /* release lock */ + mtx_unlock_spin(&smp_rv_mtx); +} diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c new file mode 100644 index 0000000..19a93ad --- /dev/null +++ b/sys/kern/subr_taskqueue.c @@ -0,0 +1,223 @@ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/taskqueue.h> + +static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues"); + +static STAILQ_HEAD(taskqueue_list, taskqueue) taskqueue_queues; + +static void *taskqueue_ih; +static struct mtx taskqueue_queues_mutex; + +struct taskqueue { + STAILQ_ENTRY(taskqueue) tq_link; + STAILQ_HEAD(, task) tq_queue; + const char *tq_name; + taskqueue_enqueue_fn tq_enqueue; + void *tq_context; + int tq_draining; + struct mtx tq_mutex; +}; + +static void init_taskqueue_list(void *data); + +static void +init_taskqueue_list(void *data __unused) +{ + + mtx_init(&taskqueue_queues_mutex, "taskqueue list", NULL, MTX_DEF); + STAILQ_INIT(&taskqueue_queues); +} +SYSINIT(taskqueue_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_taskqueue_list, + NULL); + +struct taskqueue * +taskqueue_create(const char *name, int mflags, + taskqueue_enqueue_fn enqueue, void *context) +{ + struct taskqueue *queue; + + queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO); + if (!queue) + return 0; + + STAILQ_INIT(&queue->tq_queue); + queue->tq_name = name; + queue->tq_enqueue = enqueue; + queue->tq_context = context; + queue->tq_draining = 0; + mtx_init(&queue->tq_mutex, "taskqueue", NULL, MTX_DEF); + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_INSERT_TAIL(&taskqueue_queues, queue, tq_link); + mtx_unlock(&taskqueue_queues_mutex); + + return queue; +} + +void +taskqueue_free(struct taskqueue *queue) +{ + + mtx_lock(&queue->tq_mutex); + queue->tq_draining = 1; + mtx_unlock(&queue->tq_mutex); + + taskqueue_run(queue); + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_REMOVE(&taskqueue_queues, queue, taskqueue, tq_link); + mtx_unlock(&taskqueue_queues_mutex); + + mtx_destroy(&queue->tq_mutex); + free(queue, M_TASKQUEUE); +} + +/* + * Returns with the taskqueue locked. 
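A minimal sketch of a consumer of this interface, using the pre-defined software-interrupt queue set up at the end of this file (function and variable names are illustrative; TASK_INIT() is the task initializer macro from <sys/taskqueue.h>):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

static struct task example_task;

static void
example_task_fn(void *context, int pending)
{

	/* 'pending' is how many enqueues were coalesced before we ran. */
	printf("deferred work ran, pending=%d\n", pending);
}

static void
example_defer_work(void)
{

	TASK_INIT(&example_task, 0, example_task_fn, NULL);
	taskqueue_enqueue(taskqueue_swi, &example_task);
}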
+ */ +struct taskqueue * +taskqueue_find(const char *name) +{ + struct taskqueue *queue; + + mtx_lock(&taskqueue_queues_mutex); + STAILQ_FOREACH(queue, &taskqueue_queues, tq_link) { + mtx_lock(&queue->tq_mutex); + if (!strcmp(queue->tq_name, name)) { + mtx_unlock(&taskqueue_queues_mutex); + return queue; + } + mtx_unlock(&queue->tq_mutex); + } + mtx_unlock(&taskqueue_queues_mutex); + return 0; +} + +int +taskqueue_enqueue(struct taskqueue *queue, struct task *task) +{ + struct task *ins; + struct task *prev; + + mtx_lock(&queue->tq_mutex); + + /* + * Don't allow new tasks on a queue which is being freed. + */ + if (queue->tq_draining) { + mtx_unlock(&queue->tq_mutex); + return EPIPE; + } + + /* + * Count multiple enqueues. + */ + if (task->ta_pending) { + task->ta_pending++; + mtx_unlock(&queue->tq_mutex); + return 0; + } + + /* + * Optimise the case when all tasks have the same priority. + */ + prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); + if (!prev || prev->ta_priority >= task->ta_priority) { + STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); + } else { + prev = 0; + for (ins = STAILQ_FIRST(&queue->tq_queue); ins; + prev = ins, ins = STAILQ_NEXT(ins, ta_link)) + if (ins->ta_priority < task->ta_priority) + break; + + if (prev) + STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); + else + STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); + } + + task->ta_pending = 1; + if (queue->tq_enqueue) + queue->tq_enqueue(queue->tq_context); + + mtx_unlock(&queue->tq_mutex); + + return 0; +} + +void +taskqueue_run(struct taskqueue *queue) +{ + struct task *task; + int pending; + + mtx_lock(&queue->tq_mutex); + while (STAILQ_FIRST(&queue->tq_queue)) { + /* + * Carefully remove the first task from the queue and + * zero its pending count. + */ + task = STAILQ_FIRST(&queue->tq_queue); + STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); + pending = task->ta_pending; + task->ta_pending = 0; + mtx_unlock(&queue->tq_mutex); + + task->ta_func(task->ta_context, pending); + + mtx_lock(&queue->tq_mutex); + } + mtx_unlock(&queue->tq_mutex); +} + +static void +taskqueue_swi_enqueue(void *context) +{ + swi_sched(taskqueue_ih, 0); +} + +static void +taskqueue_swi_run(void *dummy) +{ + taskqueue_run(taskqueue_swi); +} + +TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, 0, + swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, 0, + &taskqueue_ih)); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c new file mode 100644 index 0000000..3b415de --- /dev/null +++ b/sys/kern/subr_trap.c @@ -0,0 +1,209 @@ +/*- + * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the University of Utah, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $FreeBSD$ + */ + +#ifdef __i386__ +#include "opt_npx.h" +#endif + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/systm.h> +#include <sys/vmmeter.h> +#include <machine/cpu.h> +#include <machine/pcb.h> + +/* + * Define the code needed before returning to user mode, for + * trap and syscall. + * + * MPSAFE + */ +void +userret(td, frame, oticks) + struct thread *td; + struct trapframe *frame; + u_int oticks; +{ + struct proc *p = td->td_proc; + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + +#ifdef INVARIANTS + /* Check that we called signotify() enough. */ + mtx_lock(&Giant); + PROC_LOCK(p); + mtx_lock_spin(&sched_lock); + if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 || + (p->p_kse.ke_flags & KEF_ASTPENDING) == 0)) + printf("failed to set signal flags proprly for ast()\n"); + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + mtx_unlock(&Giant); +#endif + + /* + * XXX we cheat slightly on the locking here to avoid locking in + * the usual case. Setting td_priority here is essentially an + * incomplete workaround for not setting it properly elsewhere. + * Now that some interrupt handlers are threads, not setting it + * properly elsewhere can clobber it in the window between setting + * it here and returning to user mode, so don't waste time setting + * it perfectly here. + */ + if (td->td_priority != kg->kg_user_pri) { + mtx_lock_spin(&sched_lock); + td->td_priority = kg->kg_user_pri; + mtx_unlock_spin(&sched_lock); + } + + /* + * Charge system time if profiling. + * + * XXX should move PS_PROFIL to a place that can obviously be + * accessed safely without sched_lock. + */ + if (p->p_sflag & PS_PROFIL) { + quad_t ticks; + + mtx_lock_spin(&sched_lock); + ticks = ke->ke_sticks - oticks; + mtx_unlock_spin(&sched_lock); + addupc_task(ke, TRAPF_PC(frame), (u_int)ticks * psratio); + } +} + +/* + * Process an asynchronous software trap. + * This is relatively easy. + * This function will return with preemption disabled. 
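For orientation, the calling convention amounts to roughly the following, although the real check lives in machine-dependent assembly in the trap and interrupt return paths (a pseudo-C sketch only):

	/* On the way back to user mode, with interrupts disabled: */
	while (curthread->td_kse->ke_flags &
	    (KEF_ASTPENDING | KEF_NEEDRESCHED))
		ast(framep);	/* may switch; flags are re-checked after */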
+ */ +void +ast(framep) + struct trapframe *framep; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + struct kse *ke = td->td_kse; + struct ksegrp *kg = td->td_ksegrp; + u_int prticks, sticks; + int sflag; + int flags; + int sig; +#if defined(DEV_NPX) && !defined(SMP) + int ucode; +#endif + + KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); +#ifdef WITNESS + if (witness_list(td)) + panic("Returning to user mode with mutex(s) held"); +#endif + mtx_assert(&Giant, MA_NOTOWNED); + mtx_assert(&sched_lock, MA_NOTOWNED); + prticks = 0; /* XXX: Quiet warning. */ + td->td_frame = framep; + /* + * This updates the p_sflag's for the checks below in one + * "atomic" operation with turning off the astpending flag. + * If another AST is triggered while we are handling the + * AST's saved in sflag, the astpending flag will be set and + * ast() will be called again. + */ + mtx_lock_spin(&sched_lock); + sticks = ke->ke_sticks; + sflag = p->p_sflag; + flags = ke->ke_flags; + p->p_sflag &= ~(PS_ALRMPEND | PS_NEEDSIGCHK | PS_PROFPEND); + ke->ke_flags &= ~(KEF_ASTPENDING | KEF_NEEDRESCHED | KEF_OWEUPC); + cnt.v_soft++; + if (flags & KEF_OWEUPC && sflag & PS_PROFIL) { + prticks = p->p_stats->p_prof.pr_ticks; + p->p_stats->p_prof.pr_ticks = 0; + } + mtx_unlock_spin(&sched_lock); + + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + if (flags & KEF_OWEUPC && sflag & PS_PROFIL) + addupc_task(ke, p->p_stats->p_prof.pr_addr, prticks); + if (sflag & PS_ALRMPEND) { + PROC_LOCK(p); + psignal(p, SIGVTALRM); + PROC_UNLOCK(p); + } +#if defined(DEV_NPX) && !defined(SMP) + if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { + atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, + PCB_NPXTRAP); + ucode = npxtrap(); + if (ucode != -1) { + trapsignal(p, SIGFPE, ucode); + } + } +#endif + if (sflag & PS_PROFPEND) { + PROC_LOCK(p); + psignal(p, SIGPROF); + PROC_UNLOCK(p); + } + if (flags & KEF_NEEDRESCHED) { + mtx_lock_spin(&sched_lock); + td->td_priority = kg->kg_user_pri; + setrunqueue(td); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + mtx_unlock_spin(&sched_lock); + } + if (sflag & PS_NEEDSIGCHK) { + PROC_LOCK(p); + while ((sig = cursig(p)) != 0) + postsig(sig); + PROC_UNLOCK(p); + } + + userret(td, framep, sticks); + mtx_assert(&Giant, MA_NOTOWNED); +} diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c new file mode 100644 index 0000000..08bca8d --- /dev/null +++ b/sys/kern/subr_turnstile.c @@ -0,0 +1,986 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Machine independent bits of mutex implementation. + */ + +#include "opt_adaptive_mutexes.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sbuf.h> +#include <sys/stdint.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> + +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/clock.h> +#include <machine/cpu.h> + +#include <ddb/ddb.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +/* + * Internal utility macros. + */ +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) + +#define mtx_owner(m) (mtx_unowned((m)) ? NULL \ + : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK)) + +/* XXXKSE This test will change. */ +#define thread_running(td) \ + ((td)->td_kse != NULL && (td)->td_kse->ke_oncpu != NOCPU) + +/* + * Lock classes for sleep and spin mutexes. + */ +struct lock_class lock_class_mtx_sleep = { + "sleep mutex", + LC_SLEEPLOCK | LC_RECURSABLE +}; +struct lock_class lock_class_mtx_spin = { + "spin mutex", + LC_SPINLOCK | LC_RECURSABLE +}; + +/* + * System-wide mutexes + */ +struct mtx sched_lock; +struct mtx Giant; + +/* + * Prototypes for non-exported routines. + */ +static void propagate_priority(struct thread *); + +static void +propagate_priority(struct thread *td) +{ + int pri = td->td_priority; + struct mtx *m = td->td_blocked; + + mtx_assert(&sched_lock, MA_OWNED); + for (;;) { + struct thread *td1; + + td = mtx_owner(m); + + if (td == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of thread that + * next acquires the mutex. + */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + + MPASS(td->td_proc->p_magic == P_MAGIC); + KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + if (td->td_priority <= pri) /* lower is higher priority */ + return; + + /* + * Bump this thread's priority. + */ + td->td_priority = pri; + + /* + * If lock holder is actually running, just bump priority. + */ + if (thread_running(td)) { + MPASS(td->td_proc->p_stat == SRUN + || td->td_proc->p_stat == SZOMB + || td->td_proc->p_stat == SSTOP); + return; + } + +#ifndef SMP + /* + * For UP, we check to see if td is curthread (this shouldn't + * ever happen however as it would mean we are in a deadlock.) + */ + KASSERT(td != curthread, ("Deadlock detected")); +#endif + + /* + * If on run queue move to new run queue, and quit. + * XXXKSE this gets a lot more complicated under threads + * but try anyhow. + */ + if (td->td_proc->p_stat == SRUN) { + MPASS(td->td_blocked == NULL); + remrunqueue(td); + setrunqueue(td); + return; + } + + /* + * If we aren't blocked on a mutex, we should be. 
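To make the loop above concrete (priority values are illustrative; as noted in the code, a lower number means a higher priority): suppose thread C (priority 120) owns mutex M1 and is itself blocked on M2, which is owned by thread D (priority 140). When thread B (priority 80) blocks on M1, propagate_priority() bumps C to 80; because C is blocked rather than running or runnable, the loop then follows C's td_blocked pointer to M2 and bumps D to 80 as well, re-sorting each mutex's blocked queue as it goes.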
+ */ + KASSERT(td->td_proc->p_stat == SMTX, ( + "process %d(%s):%d holds %s but isn't blocked on a mutex\n", + td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + m->mtx_object.lo_name)); + + /* + * Pick up the mutex that td is blocked on. + */ + m = td->td_blocked; + MPASS(m != NULL); + + /* + * Check if the thread needs to be moved up on + * the blocked chain + */ + if (td == TAILQ_FIRST(&m->mtx_blocked)) { + continue; + } + + td1 = TAILQ_PREV(td, threadqueue, td_blkq); + if (td1->td_priority <= pri) { + continue; + } + + /* + * Remove thread from blocked chain and determine where + * it should be moved up to. Since we know that td1 has + * a lower priority than td, we know that at least one + * thread in the chain has a lower priority and that + * td1 will thus not be NULL after the loop. + */ + TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq); + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) { + MPASS(td1->td_proc->p_magic == P_MAGIC); + if (td1->td_priority > pri) + break; + } + + MPASS(td1 != NULL); + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + CTR4(KTR_LOCK, + "propagate_priority: p %p moved before %p on [%p] %s", + td, td1, m, m->mtx_object.lo_name); + } +} + +#ifdef MUTEX_PROFILING +SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging"); +SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling"); +static int mutex_prof_enable = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW, + &mutex_prof_enable, 0, "Enable tracing of mutex holdtime"); + +struct mutex_prof { + const char *name; + const char *file; + int line; +#define MPROF_MAX 0 +#define MPROF_TOT 1 +#define MPROF_CNT 2 +#define MPROF_AVG 3 + uintmax_t counter[4]; + struct mutex_prof *next; +}; + +/* + * mprof_buf is a static pool of profiling records to avoid possible + * reentrance of the memory allocation functions. + * + * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE. + */ +#define NUM_MPROF_BUFFERS 1000 +static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS]; +static int first_free_mprof_buf; +#define MPROF_HASH_SIZE 1009 +static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE]; + +static int mutex_prof_acquisitions; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD, + &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded"); +static int mutex_prof_records; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD, + &mutex_prof_records, 0, "Number of profiling records"); +static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD, + &mutex_prof_maxrecords, 0, "Maximum number of profiling records"); +static int mutex_prof_rejected; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD, + &mutex_prof_rejected, 0, "Number of rejected profiling records"); +static int mutex_prof_hashsize = MPROF_HASH_SIZE; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD, + &mutex_prof_hashsize, 0, "Hash size"); +static int mutex_prof_collisions = 0; +SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD, + &mutex_prof_collisions, 0, "Number of hash collisions"); + +/* + * mprof_mtx protects the profiling buffers and the hash. 
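In practice, on a kernel compiled with MUTEX_PROFILING the knobs declared above show up under debug.mutex.prof: setting debug.mutex.prof.enable to 1 starts recording, and reading debug.mutex.prof.stats (the sysctl handler defined just below) dumps the per-acquisition-point max/total/count/average hold times, scaled from nanoseconds to microseconds.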
+ */ +static struct mtx mprof_mtx; +MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET); + +static u_int64_t +nanoseconds(void) +{ + struct timespec tv; + + nanotime(&tv); + return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec); +} + +static int +dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS) +{ + struct sbuf *sb; + int error, i; + + if (first_free_mprof_buf == 0) + return SYSCTL_OUT(req, "No locking recorded", + sizeof("No locking recorded")); + + sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND); + sbuf_printf(sb, "%12s %12s %12s %12s %s\n", + "max", "total", "count", "average", "name"); + mtx_lock_spin(&mprof_mtx); + for (i = 0; i < first_free_mprof_buf; ++i) + sbuf_printf(sb, "%12ju %12ju %12ju %12ju %s:%d (%s)\n", + mprof_buf[i].counter[MPROF_MAX] / 1000, + mprof_buf[i].counter[MPROF_TOT] / 1000, + mprof_buf[i].counter[MPROF_CNT], + mprof_buf[i].counter[MPROF_AVG] / 1000, + mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name); + mtx_unlock_spin(&mprof_mtx); + sbuf_finish(sb); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return (error); +} +SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics"); +#endif + +/* + * Function versions of the inlined __mtx_* macros. These are used by + * modules and can also be called from assembly language if needed. + */ +void +_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + _get_sleep_lock(m, curthread, opts, file, line); + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +#ifdef MUTEX_PROFILING + /* don't reset the timer when/if recursing */ + if (m->acqtime == 0) { + m->file = file; + m->line = line; + m->acqtime = mutex_prof_enable ? 
nanoseconds() : 0; + ++mutex_prof_acquisitions; + } +#endif +} + +void +_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); +#ifdef MUTEX_PROFILING + if (m->acqtime != 0) { + static const char *unknown = "(unknown)"; + struct mutex_prof *mpp; + u_int64_t acqtime, now; + const char *p, *q; + volatile u_int hash; + + now = nanoseconds(); + acqtime = m->acqtime; + m->acqtime = 0; + if (now <= acqtime) + goto out; + for (p = file; strncmp(p, "../", 3) == 0; p += 3) + /* nothing */ ; + if (p == NULL || *p == '\0') + p = unknown; + for (hash = line, q = p; *q != '\0'; ++q) + hash = (hash * 2 + *q) % MPROF_HASH_SIZE; + mtx_lock_spin(&mprof_mtx); + for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next) + if (mpp->line == line && strcmp(mpp->file, p) == 0) + break; + if (mpp == NULL) { + /* Just exit if we cannot get a trace buffer */ + if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) { + ++mutex_prof_rejected; + goto unlock; + } + mpp = &mprof_buf[first_free_mprof_buf++]; + mpp->name = mtx_name(m); + mpp->file = p; + mpp->line = line; + mpp->next = mprof_hash[hash]; + if (mprof_hash[hash] != NULL) + ++mutex_prof_collisions; + mprof_hash[hash] = mpp; + ++mutex_prof_records; + } + /* + * Record if the mutex has been held longer now than ever + * before + */ + if ((now - acqtime) > mpp->counter[MPROF_MAX]) + mpp->counter[MPROF_MAX] = now - acqtime; + mpp->counter[MPROF_TOT] += now - acqtime; + mpp->counter[MPROF_CNT] += 1; + mpp->counter[MPROF_AVG] = + mpp->counter[MPROF_TOT] / mpp->counter[MPROF_CNT]; +unlock: + mtx_unlock_spin(&mprof_mtx); + } +out: +#endif + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + _rel_sleep_lock(m, curthread, opts, file, line); +} + +void +_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); +#if defined(SMP) || LOCK_DEBUG > 0 + _get_spin_lock(m, curthread, opts, file, line); +#else + critical_enter(); +#endif + LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); +} + +void +_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line) +{ + + MPASS(curthread != NULL); + mtx_assert(m, MA_OWNED); + WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line); + LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file, + line); +#if defined(SMP) || LOCK_DEBUG > 0 + _rel_spin_lock(m); +#else + critical_exit(); +#endif +} + +/* + * The important part of mtx_trylock{,_flags}() + * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that + * if we're called, it's because we know we don't already own this lock. + */ +int +_mtx_trylock(struct mtx *m, int opts, const char *file, int line) +{ + int rval; + + MPASS(curthread != NULL); + + rval = _obtain_lock(m, curthread); + + LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line); + if (rval) { + /* + * We do not handle recursion in _mtx_trylock; see the + * note at the top of the routine. + */ + KASSERT(!mtx_recursed(m), + ("mtx_trylock() called on a recursed mutex")); + WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, + file, line); + } + + return (rval); +} + +/* + * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. + * + * We call this if the lock is either contested (i.e. 
we need to go to + * sleep waiting for it), or if we need to recurse on it. + */ +void +_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td = curthread; +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + struct thread *owner; +#endif + + if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) { + m->mtx_recurse++; + atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); + return; + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR4(KTR_LOCK, + "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", + m->mtx_object.lo_name, (void *)m->mtx_lock, file, line); + + while (!_obtain_lock(m, td)) { + uintptr_t v; + struct thread *td1; + + mtx_lock_spin(&sched_lock); + /* + * Check if the lock has been released while spinning for + * the sched_lock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + + /* + * The mutex was marked contested on release. This means that + * there are threads blocked on it. + */ + if (v == MTX_CONTESTED) { + td1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(td1 != NULL); + m->mtx_lock = (uintptr_t)td | MTX_CONTESTED; + + if (td1->td_priority < td->td_priority) + td->td_priority = td1->td_priority; + mtx_unlock_spin(&sched_lock); + return; + } + + /* + * If the mutex isn't already contested and a failure occurs + * setting the contested bit, the mutex was either released + * or the state of the MTX_RECURSED bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, + (void *)(v | MTX_CONTESTED))) { + mtx_unlock_spin(&sched_lock); +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + /* + * If the current owner of the lock is executing on another + * CPU, spin instead of blocking. + */ + owner = (struct thread *)(v & MTX_FLAGMASK); + if (m != &Giant && thread_running(owner)) { + mtx_unlock_spin(&sched_lock); + while (mtx_owner(m) == owner && thread_running(owner)) { +#ifdef __i386__ + ia32_pause(); +#endif + } + continue; + } +#endif /* SMP && ADAPTIVE_MUTEXES */ + + /* + * We definitely must sleep for this lock. + */ + mtx_assert(m, MA_NOTOWNED); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM context, we + * must clean up before going to sleep. + */ + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_lock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* + * Put us on the list of threads blocked on this mutex. + */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + td1 = mtx_owner(m); + LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } else { + TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) + if (td1->td_priority > td->td_priority) + break; + if (td1) + TAILQ_INSERT_BEFORE(td1, td, td_blkq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq); + } + + /* + * Save who we're blocked on. 
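To sketch the handshake with _mtx_unlock_sleep() further down: if thread B reaches this point while thread A owns the mutex, B is inserted into the mutex's priority-ordered mtx_blocked queue, marked SMTX, lends its priority to A via propagate_priority(), and switches out; when A eventually releases the lock, _mtx_unlock_sleep() dequeues the highest-priority waiter, recomputes A's own priority from its remaining contested locks, marks the waiter SRUN and places it back on the run queue.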
+ */ + td->td_blocked = m; + td->td_mtxname = m->mtx_object.lo_name; + td->td_proc->p_stat = SMTX; + propagate_priority(td); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m, + m->mtx_object.lo_name); + + td->td_proc->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR3(KTR_LOCK, + "_mtx_lock_sleep: p %p free from blocked on [%p] %s", + td, m, m->mtx_object.lo_name); + + mtx_unlock_spin(&sched_lock); + } + + return; +} + +/* + * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. + * + * This is only called if we need to actually spin for the lock. Recursion + * is handled inline. + */ +void +_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line) +{ + int i = 0; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); + + for (;;) { + if (_obtain_lock(m, curthread)) + break; + + /* Give interrupts a chance while we spin. */ + critical_exit(); + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 10000000) { +#ifdef __i386__ + ia32_pause(); +#endif + continue; + } + if (i < 60000000) + DELAY(1); +#ifdef DDB + else if (!db_active) +#else + else +#endif + panic("spin lock %s held by %p for > 5 seconds", + m->mtx_object.lo_name, (void *)m->mtx_lock); +#ifdef __i386__ + ia32_pause(); +#endif + } + critical_enter(); + } + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); + + return; +} + +/* + * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. + * + * We are only called here if the lock is recursed or contested (i.e. we + * need to wake up a blocked thread). + */ +void +_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) +{ + struct thread *td, *td1; + struct mtx *m1; + int pri; + + td = curthread; + + if (mtx_recursed(m)) { + if (--(m->mtx_recurse) == 0) + atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); + return; + } + + mtx_lock_spin(&sched_lock); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); + + td1 = TAILQ_FIRST(&m->mtx_blocked); +#if defined(SMP) && defined(ADAPTIVE_MUTEXES) + if (td1 == NULL) { + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m); + mtx_unlock_spin(&sched_lock); + return; + } +#endif + MPASS(td->td_proc->p_magic == P_MAGIC); + MPASS(td1->td_proc->p_magic == P_MAGIC); + + TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq); + + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + _release_lock_quick(m); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); + } else + atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); + + pri = PRI_MAX; + LIST_FOREACH(m1, &td->td_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority; + if (cp < pri) + pri = cp; + } + + if (pri > td->td_base_pri) + pri = td->td_base_pri; + td->td_priority = pri; + + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", + m, td1); + + td1->td_blocked = NULL; + td1->td_proc->p_stat = SRUN; + setrunqueue(td1); + + if (td->td_critnest == 1 && td1->td_priority < pri) { +#ifdef notyet + if (td->td_ithd != NULL) { + struct ithd *it = td->td_ithd; + + if (it->it_interrupted) { + if (LOCK_LOG_TEST(&m->mtx_object, 
opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p interrupted %p", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + setrunqueue(td); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, + "_mtx_unlock_sleep: %p switching out lock=%p", m, + (void *)m->mtx_lock); + + td->td_proc->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + if (LOCK_LOG_TEST(&m->mtx_object, opts)) + CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", + m, (void *)m->mtx_lock); + } + + mtx_unlock_spin(&sched_lock); + + return; +} + +/* + * All the unlocking of MTX_SPIN locks is done inline. + * See the _rel_spin_lock() macro for the details. + */ + +/* + * The backing function for the INVARIANTS-enabled mtx_assert() + */ +#ifdef INVARIANT_SUPPORT +void +_mtx_assert(struct mtx *m, int what, const char *file, int line) +{ + + if (panicstr != NULL) + return; + switch (what) { + case MA_OWNED: + case MA_OWNED | MA_RECURSED: + case MA_OWNED | MA_NOTRECURSED: + if (!mtx_owned(m)) + panic("mutex %s not owned at %s:%d", + m->mtx_object.lo_name, file, line); + if (mtx_recursed(m)) { + if ((what & MA_NOTRECURSED) != 0) + panic("mutex %s recursed at %s:%d", + m->mtx_object.lo_name, file, line); + } else if ((what & MA_RECURSED) != 0) { + panic("mutex %s unrecursed at %s:%d", + m->mtx_object.lo_name, file, line); + } + break; + case MA_NOTOWNED: + if (mtx_owned(m)) + panic("mutex %s owned at %s:%d", + m->mtx_object.lo_name, file, line); + break; + default: + panic("unknown mtx_assert at %s:%d", file, line); + } +} +#endif + +/* + * The MUTEX_DEBUG-enabled mtx_validate() + * + * Most of these checks have been moved off into the LO_INITIALIZED flag + * maintained by the witness code. + */ +#ifdef MUTEX_DEBUG + +void mtx_validate(struct mtx *); + +void +mtx_validate(struct mtx *m) +{ + +/* + * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly + * we can re-enable the kernacc() checks. + */ +#ifndef __alpha__ + /* + * Can't call kernacc() from early init386(), especially when + * initializing Giant mutex, because some stuff in kernacc() + * requires Giant itself. + */ + if (!cold) + if (!kernacc((caddr_t)m, sizeof(m), + VM_PROT_READ | VM_PROT_WRITE)) + panic("Can't read and write to mutex %p", m); +#endif +} +#endif + +/* + * General init routine used by the MTX_SYSINIT() macro. + */ +void +mtx_sysinit(void *arg) +{ + struct mtx_args *margs = arg; + + mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); +} + +/* + * Mutex initialization routine; initialize lock `m' of type contained in + * `opts' with options contained in `opts' and name `name.' The optional + * lock type `type' is used as a general lock category name for use with + * witness. + */ +void +mtx_init(struct mtx *m, const char *name, const char *type, int opts) +{ + struct lock_object *lock; + + MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | + MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0); + +#ifdef MUTEX_DEBUG + /* Diagnostic and error correction */ + mtx_validate(m); +#endif + + lock = &m->mtx_object; + KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, + ("mutex %s %p already initialized", name, m)); + bzero(m, sizeof(*m)); + if (opts & MTX_SPIN) + lock->lo_class = &lock_class_mtx_spin; + else + lock->lo_class = &lock_class_mtx_sleep; + lock->lo_name = name; + lock->lo_type = type != NULL ? 
type : name; + if (opts & MTX_QUIET) + lock->lo_flags = LO_QUIET; + if (opts & MTX_RECURSE) + lock->lo_flags |= LO_RECURSABLE; + if (opts & MTX_SLEEPABLE) + lock->lo_flags |= LO_SLEEPABLE; + if ((opts & MTX_NOWITNESS) == 0) + lock->lo_flags |= LO_WITNESS; + if (opts & MTX_DUPOK) + lock->lo_flags |= LO_DUPOK; + + m->mtx_lock = MTX_UNOWNED; + TAILQ_INIT(&m->mtx_blocked); + + LOCK_LOG_INIT(lock, opts); + + WITNESS_INIT(lock); +} + +/* + * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be + * passed in as a flag here because if the corresponding mtx_init() was + * called with MTX_QUIET set, then it will already be set in the mutex's + * flags. + */ +void +mtx_destroy(struct mtx *m) +{ + + LOCK_LOG_DESTROY(&m->mtx_object, 0); + + if (!mtx_owned(m)) + MPASS(mtx_unowned(m)); + else { + MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); + + /* Tell witness this isn't locked to make it happy. */ + WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__, + __LINE__); + } + + WITNESS_DESTROY(&m->mtx_object); +} + +/* + * Intialize the mutex code and system mutexes. This is called from the MD + * startup code prior to mi_startup(). The per-CPU data space needs to be + * setup before this is called. + */ +void +mutex_init(void) +{ + + /* Setup thread0 so that mutexes work. */ + LIST_INIT(&thread0.td_contested); + + /* + * Initialize mutexes. + */ + mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); + mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + mtx_lock(&Giant); +} + +/* + * Encapsulated Giant mutex routines. These routines provide encapsulation + * control for the Giant mutex, allowing sysctls to be used to turn on and + * off Giant around certain subsystems. The default value for the sysctls + * are set to what developers believe is stable and working in regards to + * the Giant pushdown. Developers should not turn off Giant via these + * sysctls unless they know what they are doing. + * + * Callers of mtx_lock_giant() are expected to pass the return value to an + * accompanying mtx_unlock_giant() later on. If multiple subsystems are + * effected by a Giant wrap, all related sysctl variables must be zero for + * the subsystem call to operate without Giant (as determined by the caller). + */ + +SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation"); + +static int kern_giant_all = 0; +SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, ""); + +int kern_giant_proc = 1; /* Giant around PROC locks */ +int kern_giant_file = 1; /* Giant around struct file & filedesc */ +int kern_giant_ucred = 1; /* Giant around ucred */ +SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, ""); +SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, ""); + +int +mtx_lock_giant(int sysctlvar) +{ + if (sysctlvar || kern_giant_all) { + mtx_lock(&Giant); + return(1); + } + return(0); +} + +void +mtx_unlock_giant(int s) +{ + if (s) + mtx_unlock(&Giant); +} + diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c new file mode 100644 index 0000000..182221d --- /dev/null +++ b/sys/kern/subr_witness.c @@ -0,0 +1,1488 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +/* + * Implementation of the `witness' lock verifier. Originally implemented for + * mutexes in BSD/OS. Extended to handle generic lock objects and lock + * classes in FreeBSD. + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction <the heroic witness to divine + * life -- Pilot> + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include "opt_ddb.h" +#include "opt_witness.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <ddb/ddb.h> + +#define WITNESS_COUNT 200 +#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4) +/* + * XXX: This is somewhat bogus, as we assume here that at most 1024 threads + * will hold LOCK_NCHILDREN * 2 locks. We handle failure ok, and we should + * probably be safe for the most part, but it's still a SWAG. + */ +#define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2 + +#define WITNESS_NCHILDREN 6 + +struct witness_child_list_entry; + +struct witness { + const char *w_name; + struct lock_class *w_class; + STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ + STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. 
*/ + struct witness_child_list_entry *w_children; /* Great evilness... */ + const char *w_file; + int w_line; + u_int w_level; + u_int w_refcount; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; +}; + +struct witness_child_list_entry { + struct witness_child_list_entry *wcl_next; + struct witness *wcl_children[WITNESS_NCHILDREN]; + u_int wcl_count; +}; + +STAILQ_HEAD(witness_list, witness); + +struct witness_blessed { + const char *b_lock1; + const char *b_lock2; +}; + +struct witness_order_list_entry { + const char *w_name; + struct lock_class *w_class; +}; + +static struct witness *enroll(const char *description, + struct lock_class *lock_class); +static int itismychild(struct witness *parent, struct witness *child); +static void removechild(struct witness *parent, struct witness *child); +static int isitmychild(struct witness *parent, struct witness *child); +static int isitmydescendant(struct witness *parent, struct witness *child); +static int blessed(struct witness *, struct witness *); +static void witness_display_list(void(*prnt)(const char *fmt, ...), + struct witness_list *list); +static void witness_displaydescendants(void(*)(const char *fmt, ...), + struct witness *); +static void witness_leveldescendents(struct witness *parent, int level); +static void witness_levelall(void); +static struct witness *witness_get(void); +static void witness_free(struct witness *m); +static struct witness_child_list_entry *witness_child_get(void); +static void witness_child_free(struct witness_child_list_entry *wcl); +static struct lock_list_entry *witness_lock_list_get(void); +static void witness_lock_list_free(struct lock_list_entry *lle); +static void witness_display(void(*)(const char *fmt, ...)); +static struct lock_instance *find_instance(struct lock_list_entry *lock_list, + struct lock_object *lock); + +MALLOC_DEFINE(M_WITNESS, "witness", "witness structure"); + +static int witness_watch = 1; +TUNABLE_INT("debug.witness_watch", &witness_watch); +SYSCTL_INT(_debug, OID_AUTO, witness_watch, CTLFLAG_RD, &witness_watch, 0, ""); + +#ifdef DDB +/* + * When DDB is enabled and witness_ddb is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock heirarchy violation occurs + * - locks are held when going to sleep. 
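Both this knob and debug.witness_skipspin below are declared as loader tunables (TUNABLE_INT) in addition to sysctls, so on a WITNESS kernel they can typically also be set from the boot loader, e.g. debug.witness_ddb="1" in loader.conf, or defaulted at compile time via the WITNESS_DDB and WITNESS_SKIPSPIN options.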
+ */ +#ifdef WITNESS_DDB +int witness_ddb = 1; +#else +int witness_ddb = 0; +#endif +TUNABLE_INT("debug.witness_ddb", &witness_ddb); +SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); +#endif /* DDB */ + +#ifdef WITNESS_SKIPSPIN +int witness_skipspin = 1; +#else +int witness_skipspin = 0; +#endif +TUNABLE_INT("debug.witness_skipspin", &witness_skipspin); +SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, + ""); + +static struct mtx w_mtx; +static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); +static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); +static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); +static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); +static struct witness_child_list_entry *w_child_free = NULL; +static struct lock_list_entry *w_lock_list_free = NULL; +static int witness_dead; /* fatal error, probably no memory */ + +static struct witness w_data[WITNESS_COUNT]; +static struct witness_child_list_entry w_childdata[WITNESS_CHILDCOUNT]; +static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; + +static struct witness_order_list_entry order_lists[] = { + { "Giant", &lock_class_mtx_sleep }, + { "proctree", &lock_class_sx }, + { "allproc", &lock_class_sx }, + { "sigio lock", &lock_class_mtx_sleep }, + { "process group", &lock_class_mtx_sleep }, + { "process lock", &lock_class_mtx_sleep }, + { "session", &lock_class_mtx_sleep }, + { "uidinfo hash", &lock_class_mtx_sleep }, + { "uidinfo struct", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* + * spin locks + */ +#ifdef SMP + { "ap boot", &lock_class_mtx_spin }, +#ifdef __i386__ + { "com", &lock_class_mtx_spin }, +#endif +#endif + { "sio", &lock_class_mtx_spin }, +#ifdef __i386__ + { "cy", &lock_class_mtx_spin }, +#endif + { "ng_node", &lock_class_mtx_spin }, + { "ng_worklist", &lock_class_mtx_spin }, + { "ithread table lock", &lock_class_mtx_spin }, + { "sched lock", &lock_class_mtx_spin }, + { "callout", &lock_class_mtx_spin }, + /* + * leaf locks + */ + { "allpmaps", &lock_class_mtx_spin }, + { "vm page buckets mutex", &lock_class_mtx_spin }, + { "icu", &lock_class_mtx_spin }, +#ifdef SMP + { "smp rendezvous", &lock_class_mtx_spin }, +#endif + { "clk", &lock_class_mtx_spin }, + { "mutex profiling lock", &lock_class_mtx_spin }, + { NULL, NULL }, + { NULL, NULL } +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static struct witness_blessed blessed_list[] = { +}; +static int blessed_count = + sizeof(blessed_list) / sizeof(struct witness_blessed); + +/* + * List of all locks in the system. + */ +TAILQ_HEAD(, lock_object) all_locks = TAILQ_HEAD_INITIALIZER(all_locks); + +static struct mtx all_mtx = { + { &lock_class_mtx_sleep, /* mtx_object.lo_class */ + "All locks list", /* mtx_object.lo_name */ + "All locks list", /* mtx_object.lo_type */ + LO_INITIALIZED, /* mtx_object.lo_flags */ + { NULL, NULL }, /* mtx_object.lo_list */ + NULL }, /* mtx_object.lo_witness */ + MTX_UNOWNED, 0, /* mtx_lock, mtx_recurse */ + TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), + { NULL, NULL } /* mtx_contested */ +}; + +/* + * This global is set to 0 once it becomes safe to use the witness code. + */ +static int witness_cold = 1; + +/* + * Global variables for book keeping. + */ +static int lock_cur_cnt; +static int lock_max_cnt; + +/* + * The WITNESS-enabled diagnostic code. 
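A minimal sketch of the kind of mistake this code exists to catch (lock names are illustrative):

	struct mtx a, b;

	mtx_init(&a, "lock a", NULL, MTX_DEF);
	mtx_init(&b, "lock b", NULL, MTX_DEF);

	/* One code path establishes the order "a before b"... */
	mtx_lock(&a);
	mtx_lock(&b);
	mtx_unlock(&b);
	mtx_unlock(&a);

	/*
	 * ...so when another path acquires them the other way around,
	 * witness_lock() flags a lock order violation (and, with
	 * debug.witness_ddb set, drops into the debugger).
	 */
	mtx_lock(&b);
	mtx_lock(&a);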
+ */ +static void +witness_initialize(void *dummy __unused) +{ + struct lock_object *lock; + struct witness_order_list_entry *order; + struct witness *w, *w1; + int i; + + /* + * We have to release Giant before initializing its witness + * structure so that WITNESS doesn't get confused. + */ + mtx_unlock(&Giant); + mtx_assert(&Giant, MA_NOTOWNED); + + CTR1(KTR_WITNESS, "%s: initializing witness", __func__); + TAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list); + mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | + MTX_NOWITNESS); + for (i = 0; i < WITNESS_COUNT; i++) + witness_free(&w_data[i]); + for (i = 0; i < WITNESS_CHILDCOUNT; i++) + witness_child_free(&w_childdata[i]); + for (i = 0; i < LOCK_CHILDCOUNT; i++) + witness_lock_list_free(&w_locklistdata[i]); + + /* First add in all the specified order lists. */ + for (order = order_lists; order->w_name != NULL; order++) { + w = enroll(order->w_name, order->w_class); + if (w == NULL) + continue; + w->w_file = "order list"; + for (order++; order->w_name != NULL; order++) { + w1 = enroll(order->w_name, order->w_class); + if (w1 == NULL) + continue; + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + + /* Iterate through all locks and add them to witness. */ + mtx_lock(&all_mtx); + TAILQ_FOREACH(lock, &all_locks, lo_list) { + if (lock->lo_flags & LO_WITNESS) + lock->lo_witness = enroll(lock->lo_type, + lock->lo_class); + else + lock->lo_witness = NULL; + } + mtx_unlock(&all_mtx); + + /* Mark the witness code as being ready for use. */ + atomic_store_rel_int(&witness_cold, 0); + + mtx_lock(&Giant); +} +SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL) + +void +witness_init(struct lock_object *lock) +{ + struct lock_class *class; + + class = lock->lo_class; + if (lock->lo_flags & LO_INITIALIZED) + panic("%s: lock (%s) %s is already initialized", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_RECURSABLE) != 0 && + (class->lc_flags & LC_RECURSABLE) == 0) + panic("%s: lock (%s) %s can not be recursable", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_SLEEPABLE) != 0 && + (class->lc_flags & LC_SLEEPABLE) == 0) + panic("%s: lock (%s) %s can not be sleepable", __func__, + class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_UPGRADABLE) != 0 && + (class->lc_flags & LC_UPGRADABLE) == 0) + panic("%s: lock (%s) %s can not be upgradable", __func__, + class->lc_name, lock->lo_name); + + mtx_lock(&all_mtx); + TAILQ_INSERT_TAIL(&all_locks, lock, lo_list); + lock->lo_flags |= LO_INITIALIZED; + lock_cur_cnt++; + if (lock_cur_cnt > lock_max_cnt) + lock_max_cnt = lock_cur_cnt; + mtx_unlock(&all_mtx); + if (!witness_cold && !witness_dead && panicstr == NULL && + (lock->lo_flags & LO_WITNESS) != 0) + lock->lo_witness = enroll(lock->lo_type, class); + else + lock->lo_witness = NULL; +} + +void +witness_destroy(struct lock_object *lock) +{ + struct witness *w; + + if (witness_cold) + panic("lock (%s) %s destroyed while witness_cold", + lock->lo_class->lc_name, lock->lo_name); + if ((lock->lo_flags & LO_INITIALIZED) == 0) + panic("%s: lock (%s) %s is not initialized", __func__, + lock->lo_class->lc_name, lock->lo_name); + + /* XXX: need to verify that no one holds the lock */ + w = lock->lo_witness; + if (w != NULL) { + mtx_lock_spin(&w_mtx); + MPASS(w->w_refcount > 0); + w->w_refcount--; + mtx_unlock_spin(&w_mtx); + } + + mtx_lock(&all_mtx); + lock_cur_cnt--; + TAILQ_REMOVE(&all_locks, lock, lo_list); + lock->lo_flags &= ~LO_INITIALIZED; + 
mtx_unlock(&all_mtx); +} + +static void +witness_display_list(void(*prnt)(const char *fmt, ...), + struct witness_list *list) +{ + struct witness *w, *w1; + int found; + + STAILQ_FOREACH(w, list, w_typelist) { + if (w->w_file == NULL) + continue; + found = 0; + STAILQ_FOREACH(w1, list, w_typelist) { + if (isitmychild(w1, w)) { + found++; + break; + } + } + if (found) + continue; + /* + * This lock has no anscestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } +} + +static void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + struct witness *w; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + witness_levelall(); + + /* + * First, handle sleep locks which have been acquired at least + * once. + */ + prnt("Sleep locks:\n"); + witness_display_list(prnt, &w_sleep); + + /* + * Now do spin locks which have been acquired at least once. + */ + prnt("\nSpin locks:\n"); + witness_display_list(prnt, &w_spin); + + /* + * Finally, any locks which have not been acquired yet. + */ + prnt("\nLocks which were never acquired:\n"); + STAILQ_FOREACH(w, &w_all, w_list) { + if (w->w_file != NULL || w->w_refcount == 0) + continue; + prnt("%s\n", w->w_name); + } +} + +void +witness_lock(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *lock1, *lock2; + struct lock_class *class; + struct witness *w, *w1; + struct thread *td; + int i, j; +#ifdef DDB + int go_into_ddb = 0; +#endif /* DDB */ + + if (witness_cold || witness_dead || lock->lo_witness == NULL || + panicstr != NULL) + return; + w = lock->lo_witness; + class = lock->lo_class; + td = curthread; + + if (class->lc_flags & LC_SLEEPLOCK) { + /* + * Since spin locks include a critical section, this check + * impliclty enforces a lock order of all sleep locks before + * all spin locks. + */ + if (td->td_critnest != 0 && (flags & LOP_TRYLOCK) == 0) + panic("blockable sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + lock_list = &td->td_sleeplocks; + } else + lock_list = PCPU_PTR(spinlocks); + + /* + * Try locks do not block if they fail to acquire the lock, thus + * there is no danger of deadlocks or of switching while holding a + * spin lock if we acquire a lock via a try operation. + */ + if (flags & LOP_TRYLOCK) + goto out; + + /* + * Is this the first lock acquired? If so, then no order checking + * is needed. + */ + if (*lock_list == NULL) + goto out; + + /* + * Check to see if we are recursing on a lock we already own. 
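+	 * The per-instance recursion count lives in the low bits of
+	 * li_flags (LI_RECURSEMASK): recursing just increments li_flags
+	 * and witness_unlock() decrements it, so only the outermost
+	 * acquire and release touch the lock-order bookkeeping.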
+ */ + lock1 = find_instance(*lock_list, lock); + if (lock1 != NULL) { + if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && + (flags & LOP_EXCLUSIVE) == 0) { + printf("shared lock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("while exclusively locked from %s:%d\n", + lock1->li_file, lock1->li_line); + panic("share->excl"); + } + if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && + (flags & LOP_EXCLUSIVE) != 0) { + printf("exclusive lock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("while share locked from %s:%d\n", + lock1->li_file, lock1->li_line); + panic("excl->share"); + } + lock1->li_flags++; + if ((lock->lo_flags & LO_RECURSABLE) == 0) { + printf( + "recursed on non-recursive lock (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, file, line); + printf("first acquired @ %s:%d\n", lock1->li_file, + lock1->li_line); + panic("recurse"); + } + CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, + td->td_proc->p_pid, lock->lo_name, + lock1->li_flags & LI_RECURSEMASK); + lock1->li_file = file; + lock1->li_line = line; + return; + } + + /* + * Check for duplicate locks of the same type. Note that we only + * have to check for this on the last lock we just acquired. Any + * other cases will be caught as lock order violations. + */ + lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; + w1 = lock1->li_lock->lo_witness; + if (w1 == w) { + if (w->w_same_squawked || (lock->lo_flags & LO_DUPOK)) + goto out; + w->w_same_squawked = 1; + printf("acquiring duplicate lock of same type: \"%s\"\n", + lock->lo_type); + printf(" 1st %s @ %s:%d\n", lock1->li_lock->lo_name, + lock1->li_file, lock1->li_line); + printf(" 2nd %s @ %s:%d\n", lock->lo_name, file, line); +#ifdef DDB + go_into_ddb = 1; +#endif /* DDB */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_lock_spin(&w_mtx); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_unlock_spin(&w_mtx); + goto out; + } + if (isitmydescendant(w1, w)) { + mtx_unlock_spin(&w_mtx); + goto out; + } + for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) { + for (i = lle->ll_count - 1; i >= 0; i--, j++) { + + MPASS(j < WITNESS_COUNT); + lock1 = &lle->ll_children[i]; + w1 = lock1->li_lock->lo_witness; + + /* + * If this lock doesn't undergo witness checking, + * then skip it. + */ + if (w1 == NULL) { + KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, + ("lock missing witness structure")); + continue; + } + /* + * If we are locking Giant and we slept with this + * lock, then skip it. + */ + if ((lock1->li_flags & LI_SLEPT) != 0 && + lock == &Giant.mtx_object) + continue; + /* + * If we are locking a sleepable lock and this lock + * isn't sleepable and isn't Giant, we want to treat + * it as a lock order violation to enfore a general + * lock order of sleepable locks before non-sleepable + * locks. Thus, we only bother checking the lock + * order hierarchy if we pass the initial test. + */ + if (!((lock->lo_flags & LO_SLEEPABLE) != 0 && + ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && + lock1->li_lock != &Giant.mtx_object)) && + !isitmydescendant(w, w1)) + continue; + /* + * We have a lock order violation, check to see if it + * is allowed or has already been yelled about. 
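+			 * Known-harmless pairs can be silenced by listing
+			 * them in blessed_list[] above; e.g. (hypothetical
+			 * names) an entry of roughly
+			 *
+			 *	{ "foo", "bar" },
+			 *
+			 * makes blessed() return 1 for either ordering of
+			 * the two locks and skips the report below.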
+ */ + mtx_unlock_spin(&w_mtx); + if (blessed(w, w1)) + goto out; + if (lock1->li_lock == &Giant.mtx_object) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + /* + * Ok, yell about it. + */ + printf("lock order reversal\n"); + /* + * Try to locate an earlier lock with + * witness w in our list. + */ + do { + lock2 = &lle->ll_children[i]; + MPASS(lock2->li_lock != NULL); + if (lock2->li_lock->lo_witness == w) + break; + i--; + if (i == 0 && lle->ll_next != NULL) { + lle = lle->ll_next; + i = lle->ll_count - 1; + MPASS(i != 0); + } + } while (i >= 0); + if (i < 0) { + printf(" 1st %p %s (%s) @ %s:%d\n", + lock1->li_lock, lock1->li_lock->lo_name, + lock1->li_lock->lo_type, lock1->li_file, + lock1->li_line); + printf(" 2nd %p %s (%s) @ %s:%d\n", lock, + lock->lo_name, lock->lo_type, file, line); + } else { + printf(" 1st %p %s (%s) @ %s:%d\n", + lock2->li_lock, lock2->li_lock->lo_name, + lock2->li_lock->lo_type, lock2->li_file, + lock2->li_line); + printf(" 2nd %p %s (%s) @ %s:%d\n", + lock1->li_lock, lock1->li_lock->lo_name, + lock1->li_lock->lo_type, lock1->li_file, + lock1->li_line); + printf(" 3rd %p %s (%s) @ %s:%d\n", lock, + lock->lo_name, lock->lo_type, file, line); + } +#ifdef DDB + go_into_ddb = 1; +#endif /* DDB */ + goto out; + } + } + lock1 = &(*lock_list)->ll_children[(*lock_list)->ll_count - 1]; + /* + * Don't build a new relationship if we are locking Giant just + * after waking up and the previous lock in the list was acquired + * prior to blocking. + */ + if (lock == &Giant.mtx_object && (lock1->li_flags & LI_SLEPT) != 0) + mtx_unlock_spin(&w_mtx); + else { + CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, + lock->lo_type, lock1->li_lock->lo_type); + if (!itismychild(lock1->li_lock->lo_witness, w)) + mtx_unlock_spin(&w_mtx); + } + +out: +#ifdef DDB + if (witness_ddb && go_into_ddb) + Debugger(__func__); +#endif /* DDB */ + w->w_file = file; + w->w_line = line; + + lle = *lock_list; + if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { + lle = witness_lock_list_get(); + if (lle == NULL) + return; + lle->ll_next = *lock_list; + CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, + td->td_proc->p_pid, lle); + *lock_list = lle; + } + lock1 = &lle->ll_children[lle->ll_count++]; + lock1->li_lock = lock; + lock1->li_line = line; + lock1->li_file = file; + if ((flags & LOP_EXCLUSIVE) != 0) + lock1->li_flags = LI_EXCLUSIVE; + else + lock1->li_flags = 0; + CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, + td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); +} + +void +witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_instance *instance; + struct lock_class *class; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + class = lock->lo_class; + if ((lock->lo_flags & LO_UPGRADABLE) == 0) + panic("upgrade of non-upgradable lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((flags & LOP_TRYLOCK) == 0) + panic("non-try upgrade of lock (%s) %s @ %s:%d", class->lc_name, + lock->lo_name, file, line); + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("upgrade of non-sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("upgrade of unlocked lock (%s) %s @ %s:%d", + 
class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_EXCLUSIVE) != 0) + panic("upgrade of exclusive lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_RECURSEMASK) != 0) + panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d", + class->lc_name, lock->lo_name, + instance->li_flags & LI_RECURSEMASK, file, line); + instance->li_flags |= LI_EXCLUSIVE; +} + +void +witness_downgrade(struct lock_object *lock, int flags, const char *file, + int line) +{ + struct lock_instance *instance; + struct lock_class *class; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + class = lock->lo_class; + if ((lock->lo_flags & LO_UPGRADABLE) == 0) + panic("downgrade of non-upgradable lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("downgrade of non-sleep lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("downgrade of unlocked lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_EXCLUSIVE) == 0) + panic("downgrade of shared lock (%s) %s @ %s:%d", + class->lc_name, lock->lo_name, file, line); + if ((instance->li_flags & LI_RECURSEMASK) != 0) + panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d", + class->lc_name, lock->lo_name, + instance->li_flags & LI_RECURSEMASK, file, line); + instance->li_flags &= ~LI_EXCLUSIVE; +} + +void +witness_unlock(struct lock_object *lock, int flags, const char *file, int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *instance; + struct lock_class *class; + struct thread *td; + register_t s; + int i, j; + + if (witness_cold || witness_dead || lock->lo_witness == NULL || + panicstr != NULL) + return; + td = curthread; + class = lock->lo_class; + if (class->lc_flags & LC_SLEEPLOCK) + lock_list = &td->td_sleeplocks; + else + lock_list = PCPU_PTR(spinlocks); + for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) + for (i = 0; i < (*lock_list)->ll_count; i++) { + instance = &(*lock_list)->ll_children[i]; + if (instance->li_lock == lock) { + if ((instance->li_flags & LI_EXCLUSIVE) != 0 && + (flags & LOP_EXCLUSIVE) == 0) { + printf( + "shared unlock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, + file, line); + printf( + "while exclusively locked from %s:%d\n", + instance->li_file, + instance->li_line); + panic("excl->ushare"); + } + if ((instance->li_flags & LI_EXCLUSIVE) == 0 && + (flags & LOP_EXCLUSIVE) != 0) { + printf( + "exclusive unlock of (%s) %s @ %s:%d\n", + class->lc_name, lock->lo_name, + file, line); + printf( + "while share locked from %s:%d\n", + instance->li_file, + instance->li_line); + panic("share->uexcl"); + } + /* If we are recursed, unrecurse. 
*/ + if ((instance->li_flags & LI_RECURSEMASK) > 0) { + CTR4(KTR_WITNESS, + "%s: pid %d unrecursed on %s r=%d", __func__, + td->td_proc->p_pid, + instance->li_lock->lo_name, + instance->li_flags); + instance->li_flags--; + return; + } + s = intr_disable(); + CTR4(KTR_WITNESS, + "%s: pid %d removed %s from lle[%d]", __func__, + td->td_proc->p_pid, + instance->li_lock->lo_name, + (*lock_list)->ll_count - 1); + for (j = i; j < (*lock_list)->ll_count - 1; j++) + (*lock_list)->ll_children[j] = + (*lock_list)->ll_children[j + 1]; + (*lock_list)->ll_count--; + intr_restore(s); + if ((*lock_list)->ll_count == 0) { + lle = *lock_list; + *lock_list = lle->ll_next; + CTR3(KTR_WITNESS, + "%s: pid %d removed lle %p", __func__, + td->td_proc->p_pid, lle); + witness_lock_list_free(lle); + } + return; + } + } + panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, + file, line); +} + +/* + * Warn if any held locks are not sleepable. Note that Giant and the lock + * passed in are both special cases since they are both released during the + * sleep process and aren't actually held while the thread is asleep. + */ +int +witness_sleep(int check_only, struct lock_object *lock, const char *file, + int line) +{ + struct lock_list_entry **lock_list, *lle; + struct lock_instance *lock1; + struct thread *td; + int i, n; + + if (witness_cold || witness_dead || panicstr != NULL) + return (0); + n = 0; + td = curthread; + lock_list = &td->td_sleeplocks; +again: + for (lle = *lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + lock1 = &lle->ll_children[i]; + if (lock1->li_lock == lock || + lock1->li_lock == &Giant.mtx_object) + continue; + if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) { + if (check_only == 0) { + CTR3(KTR_WITNESS, + "pid %d: sleeping with lock (%s) %s held", + td->td_proc->p_pid, + lock1->li_lock->lo_class->lc_name, + lock1->li_lock->lo_name); + lock1->li_flags |= LI_SLEPT; + } + continue; + } + n++; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? "could sleep" : "sleeping", + lock1->li_lock->lo_name, lock1->li_file, + lock1->li_line); + } + if (lock_list == &td->td_sleeplocks && PCPU_GET(spinlocks) != NULL) { + /* + * Since we already hold a spinlock preemption is + * already blocked. + */ + lock_list = PCPU_PTR(spinlocks); + goto again; + } +#ifdef DDB + if (witness_ddb && n) + Debugger(__func__); +#endif /* DDB */ + return (n); +} + +static struct witness * +enroll(const char *description, struct lock_class *lock_class) +{ + struct witness *w; + + if (!witness_watch || witness_dead || panicstr != NULL) + return (NULL); + if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin) + return (NULL); + mtx_lock_spin(&w_mtx); + STAILQ_FOREACH(w, &w_all, w_list) { + if (w->w_name == description || (w->w_refcount > 0 && + strcmp(description, w->w_name) == 0)) { + w->w_refcount++; + mtx_unlock_spin(&w_mtx); + if (lock_class != w->w_class) + panic( + "lock (%s) %s does not match earlier (%s) lock", + description, lock_class->lc_name, + w->w_class->lc_name); + return (w); + } + } + /* + * This isn't quite right, as witness_cold is still 0 while we + * enroll all the locks initialized before witness_initialize(). 
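+	 * Note, as an illustration: because of the panic below, a new spin
+	 * lock normally has to be listed in order_lists[] above, e.g. by
+	 * adding an entry such as
+	 *
+	 *	{ "my new spinlock", &lock_class_mtx_spin },
+	 *
+	 * ahead of the terminating { NULL, NULL } of the spin-lock group.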
+ */ + if ((lock_class->lc_flags & LC_SPINLOCK) && !witness_cold) { + mtx_unlock_spin(&w_mtx); + panic("spin lock %s not in order list", description); + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_name = description; + w->w_class = lock_class; + w->w_refcount = 1; + STAILQ_INSERT_HEAD(&w_all, w, w_list); + if (lock_class->lc_flags & LC_SPINLOCK) + STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); + else if (lock_class->lc_flags & LC_SLEEPLOCK) + STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); + else { + mtx_unlock_spin(&w_mtx); + panic("lock class %s is not sleep or spin", + lock_class->lc_name); + } + mtx_unlock_spin(&w_mtx); + return (w); +} + +static int +itismychild(struct witness *parent, struct witness *child) +{ + static int recursed; + struct witness_child_list_entry **wcl; + struct witness_list *list; + + MPASS(child != NULL && parent != NULL); + if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) != + (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))) + panic( + "%s: parent (%s) and child (%s) are not the same lock type", + __func__, parent->w_class->lc_name, + child->w_class->lc_name); + + /* + * Insert "child" after "parent" + */ + wcl = &parent->w_children; + while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN) + wcl = &(*wcl)->wcl_next; + if (*wcl == NULL) { + *wcl = witness_child_get(); + if (*wcl == NULL) + return (1); + } + (*wcl)->wcl_children[(*wcl)->wcl_count++] = child; + + /* + * Now prune whole tree. We look for cases where a lock is now + * both a descendant and a direct child of a given lock. In that + * case, we want to remove the direct child link from the tree. + */ + if (recursed) + return (0); + recursed = 1; + if (parent->w_class->lc_flags & LC_SLEEPLOCK) + list = &w_sleep; + else + list = &w_spin; + STAILQ_FOREACH(child, list, w_typelist) { + STAILQ_FOREACH(parent, list, w_typelist) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry **wcl, *wcl1; + int i; + + for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next) + for (i = 0; i < (*wcl)->wcl_count; i++) + if ((*wcl)->wcl_children[i] == child) + goto found; + return; +found: + (*wcl)->wcl_count--; + if ((*wcl)->wcl_count > i) + (*wcl)->wcl_children[i] = + (*wcl)->wcl_children[(*wcl)->wcl_count]; + MPASS((*wcl)->wcl_children[i] != NULL); + if ((*wcl)->wcl_count != 0) + return; + wcl1 = *wcl; + *wcl = wcl1->wcl_next; + witness_child_free(wcl1); +} + +static int +isitmychild(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry *wcl; + int i; + + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { + for (i = 0; i < wcl->wcl_count; i++) { + if (wcl->wcl_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(struct witness *parent, struct witness *child) +{ + struct witness_child_list_entry *wcl; + int i, j; + + if (isitmychild(parent, child)) + return (1); + j = 0; + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) { + MPASS(j < 1000); + for (i = 0; i < wcl->wcl_count; i++) { + if (isitmydescendant(wcl->wcl_children[i], child)) + return (1); + } + j++; + } + return (0); +} + +void +witness_levelall (void) +{ + struct witness_list *list; + struct witness *w, *w1; + + /* + * First clear 
all levels. + */ + STAILQ_FOREACH(w, &w_all, w_list) { + w->w_level = 0; + } + + /* + * Look for locks with no parent and level all their descendants. + */ + STAILQ_FOREACH(w, &w_all, w_list) { + /* + * This is just an optimization, technically we could get + * away just walking the all list each time. + */ + if (w->w_class->lc_flags & LC_SLEEPLOCK) + list = &w_sleep; + else + list = &w_spin; + STAILQ_FOREACH(w1, list, w_typelist) { + if (isitmychild(w1, w)) + goto skip; + } + witness_leveldescendents(w, 0); + skip: + ; /* silence GCC 3.x */ + } +} + +static void +witness_leveldescendents(struct witness *parent, int level) +{ + struct witness_child_list_entry *wcl; + int i; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) + for (i = 0; i < wcl->wcl_count; i++) + witness_leveldescendents(wcl->wcl_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), + struct witness *parent) +{ + struct witness_child_list_entry *wcl; + int i, level; + + level = parent->w_level; + prnt("%-2d", level); + for (i = 0; i < level; i++) + prnt(" "); + if (parent->w_refcount > 0) { + prnt("%s", parent->w_name); + if (parent->w_file != NULL) + prnt(" -- last acquired @ %s:%d\n", parent->w_file, + parent->w_line); + } else + prnt("(dead)\n"); + for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) + for (i = 0; i < wcl->wcl_count; i++) + witness_displaydescendants(prnt, + wcl->wcl_children[i]); +} + +static int +blessed(struct witness *w1, struct witness *w2) +{ + int i; + struct witness_blessed *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_name, b->b_lock1) == 0) { + if (strcmp(w2->w_name, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_name, b->b_lock2) == 0) + if (strcmp(w2->w_name, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static struct witness * +witness_get(void) +{ + struct witness *w; + + if (witness_dead) { + mtx_unlock_spin(&w_mtx); + return (NULL); + } + if (STAILQ_EMPTY(&w_free)) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w = STAILQ_FIRST(&w_free); + STAILQ_REMOVE_HEAD(&w_free, w_list); + bzero(w, sizeof(*w)); + return (w); +} + +static void +witness_free(struct witness *w) +{ + + STAILQ_INSERT_HEAD(&w_free, w, w_list); +} + +static struct witness_child_list_entry * +witness_child_get(void) +{ + struct witness_child_list_entry *wcl; + + if (witness_dead) { + mtx_unlock_spin(&w_mtx); + return (NULL); + } + wcl = w_child_free; + if (wcl == NULL) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w_child_free = wcl->wcl_next; + bzero(wcl, sizeof(*wcl)); + return (wcl); +} + +static void +witness_child_free(struct witness_child_list_entry *wcl) +{ + + wcl->wcl_next = w_child_free; + w_child_free = wcl; +} + +static struct lock_list_entry * +witness_lock_list_get(void) +{ + struct lock_list_entry *lle; + + if (witness_dead) + return (NULL); + mtx_lock_spin(&w_mtx); + lle = w_lock_list_free; + if (lle == NULL) { + witness_dead = 1; + mtx_unlock_spin(&w_mtx); + printf("%s: witness exhausted\n", __func__); + return (NULL); + } + w_lock_list_free = lle->ll_next; + mtx_unlock_spin(&w_mtx); + bzero(lle, sizeof(*lle)); + return (lle); +} + +static void +witness_lock_list_free(struct lock_list_entry *lle) +{ + + mtx_lock_spin(&w_mtx); + lle->ll_next = 
w_lock_list_free; + w_lock_list_free = lle; + mtx_unlock_spin(&w_mtx); +} + +static struct lock_instance * +find_instance(struct lock_list_entry *lock_list, struct lock_object *lock) +{ + struct lock_list_entry *lle; + struct lock_instance *instance; + int i; + + for (lle = lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + instance = &lle->ll_children[i]; + if (instance->li_lock == lock) + return (instance); + } + return (NULL); +} + +int +witness_list_locks(struct lock_list_entry **lock_list) +{ + struct lock_list_entry *lle; + struct lock_instance *instance; + struct lock_object *lock; + int i, nheld; + + nheld = 0; + for (lle = *lock_list; lle != NULL; lle = lle->ll_next) + for (i = lle->ll_count - 1; i >= 0; i--) { + instance = &lle->ll_children[i]; + lock = instance->li_lock; + printf("%s %s %s", + (instance->li_flags & LI_EXCLUSIVE) != 0 ? + "exclusive" : "shared", + lock->lo_class->lc_name, lock->lo_name); + if (lock->lo_type != lock->lo_name) + printf(" (%s)", lock->lo_type); + printf(" r = %d (%p) locked @ %s:%d\n", + instance->li_flags & LI_RECURSEMASK, lock, + instance->li_file, instance->li_line); + nheld++; + } + return (nheld); +} + +/* + * Calling this on td != curthread is bad unless we are in ddb. + */ +int +witness_list(struct thread *td) +{ + int nheld; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); +#ifdef DDB + KASSERT(td == curthread || db_active, + ("%s: td != curthread and we aren't in the debugger", __func__)); + if (!db_active && witness_dead) + return (0); +#else + KASSERT(td == curthread, ("%s: p != curthread", __func__)); + if (witness_dead) + return (0); +#endif + nheld = witness_list_locks(&td->td_sleeplocks); + + /* + * We only handle spinlocks if td == curthread. This is somewhat broken + * if td is currently executing on some other CPU and holds spin locks + * as we won't display those locks. If we had a MI way of getting + * the per-cpu data for a given cpu then we could use + * td->td_kse->ke_oncpu to get the list of spinlocks for this thread + * and "fix" this. + * + * That still wouldn't really fix this unless we locked sched_lock + * or stopped the other CPU to make sure it wasn't changing the list + * out from under us. It is probably best to just not try to handle + * threads on other CPU's for now. 
+ */ + if (td == curthread && PCPU_GET(spinlocks) != NULL) + nheld += witness_list_locks(PCPU_PTR(spinlocks)); + + return (nheld); +} + +void +witness_save(struct lock_object *lock, const char **filep, int *linep) +{ + struct lock_instance *instance; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("%s: lock (%s) %s is not a sleep lock", __func__, + lock->lo_class->lc_name, lock->lo_name); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("%s: lock (%s) %s not locked", __func__, + lock->lo_class->lc_name, lock->lo_name); + *filep = instance->li_file; + *linep = instance->li_line; +} + +void +witness_restore(struct lock_object *lock, const char *file, int line) +{ + struct lock_instance *instance; + + KASSERT(!witness_cold, ("%s: witness_cold", __func__)); + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0) + panic("%s: lock (%s) %s is not a sleep lock", __func__, + lock->lo_class->lc_name, lock->lo_name); + instance = find_instance(curthread->td_sleeplocks, lock); + if (instance == NULL) + panic("%s: lock (%s) %s not locked", __func__, + lock->lo_class->lc_name, lock->lo_name); + lock->lo_witness->w_file = file; + lock->lo_witness->w_line = line; + instance->li_file = file; + instance->li_line = line; +} + +void +witness_assert(struct lock_object *lock, int flags, const char *file, int line) +{ +#ifdef INVARIANT_SUPPORT + struct lock_instance *instance; + + if (lock->lo_witness == NULL || witness_dead || panicstr != NULL) + return; + if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) != 0) + instance = find_instance(curthread->td_sleeplocks, lock); + else if ((lock->lo_class->lc_flags & LC_SPINLOCK) != 0) + instance = find_instance(PCPU_GET(spinlocks), lock); + else { + panic("Lock (%s) %s is not sleep or spin!", + lock->lo_class->lc_name, lock->lo_name); + return; + } + switch (flags) { + case LA_UNLOCKED: + if (instance != NULL) + panic("Lock (%s) %s locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + case LA_LOCKED: + case LA_LOCKED | LA_RECURSED: + case LA_LOCKED | LA_NOTRECURSED: + case LA_SLOCKED: + case LA_SLOCKED | LA_RECURSED: + case LA_SLOCKED | LA_NOTRECURSED: + case LA_XLOCKED: + case LA_XLOCKED | LA_RECURSED: + case LA_XLOCKED | LA_NOTRECURSED: + if (instance == NULL) { + panic("Lock (%s) %s not locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + } + if ((flags & LA_XLOCKED) != 0 && + (instance->li_flags & LI_EXCLUSIVE) == 0) + panic("Lock (%s) %s not exclusively locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_SLOCKED) != 0 && + (instance->li_flags & LI_EXCLUSIVE) != 0) + panic("Lock (%s) %s exclusively locked @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_RECURSED) != 0 && + (instance->li_flags & LI_RECURSEMASK) == 0) + panic("Lock (%s) %s not recursed @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + if ((flags & LA_NOTRECURSED) != 0 && + (instance->li_flags & LI_RECURSEMASK) != 0) + panic("Lock (%s) %s recursed @ %s:%d.", + lock->lo_class->lc_name, lock->lo_name, file, line); + break; + default: + panic("Invalid lock assertion at %s:%d.", file, line); + + } +#endif /* INVARIANT_SUPPORT */ +} + +#ifdef DDB + +DB_SHOW_COMMAND(locks, db_witness_list) +{ + 
struct thread *td; + pid_t pid; + struct proc *p; + + if (have_addr) { + pid = (addr % 16) + ((addr >> 4) % 16) * 10 + + ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 + + ((addr >> 16) % 16) * 10000; + /* sx_slock(&allproc_lock); */ + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_pid == pid) + break; + } + /* sx_sunlock(&allproc_lock); */ + if (p == NULL) { + db_printf("pid %d not found\n", pid); + return; + } + FOREACH_THREAD_IN_PROC(p, td) { + witness_list(td); + } + } else { + td = curthread; + witness_list(td); + } +} + +DB_SHOW_COMMAND(witness, db_witness_display) +{ + + witness_display(db_printf); +} +#endif diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c new file mode 100644 index 0000000..c9d2676 --- /dev/null +++ b/sys/kern/subr_xxx.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Miscellaneous trivial functions. + */ +#include <sys/param.h> +#include <sys/systm.h> + +/* + * Return error for operation not supported + * on a specific object or file type. + */ +int +eopnotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Generic null operation, always returns success. + */ +int +nullop() +{ + + return (0); +} + +#include <sys/conf.h> + +/* + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. 
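+ *
+ * Hypothetical example of their use: a read-only device driver can
+ * point the write entry of its cdevsw at nowrite() and the ioctl entry
+ * at noioctl(), so callers simply get ENODEV back without the driver
+ * supplying stubs of its own.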
+ */ + +int +noopen(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, td) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct thread *td; +{ + + return (ENODEV); +} + +int +nokqfilter(dev, kn) + dev_t dev; + struct knote *kn; +{ + + return (ENODEV); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + vm_offset_t offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + +int +nodump(dev_t dev, void *virtual __unused, vm_offset_t physical __unused, off_t offset __unused, size_t length __unused) +{ + + return (ENODEV); +} + +/* + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. + */ +int +nullopen(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (0); +} + +int +nullclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + + return (0); +} diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c new file mode 100644 index 0000000..1bdd913 --- /dev/null +++ b/sys/kern/sys_generic.c @@ -0,0 +1,1210 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/socketvar.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/resourcevar.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/condvar.h> +#ifdef __alpha__ +#include <sys/disklabel.h> +#endif +#ifdef KTRACE +#include <sys/ktrace.h> +#endif +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/limits.h> + +static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); +static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); +MALLOC_DEFINE(M_IOV, "iov", "large iov's"); + +static int pollscan(struct thread *, struct pollfd *, u_int); +static int selscan(struct thread *, fd_mask **, fd_mask **, int); +static int dofileread(struct thread *, struct file *, int, void *, + size_t, off_t, int); +static int dofilewrite(struct thread *, struct file *, int, + const void *, size_t, off_t, int); + +/* + * Read system call. 
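+ *
+ * read() and pread() below differ mainly in offset handling: pread()
+ * supplies an explicit offset (FOF_OFFSET) and rejects non-vnode
+ * descriptors with ESPIPE.  Both end up in dofileread(), which returns
+ * the byte count through td->td_retval[0].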
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + void *buf; + size_t nbyte; +}; +#endif +/* + * MPSAFE + */ +int +read(td, uap) + struct thread *td; + struct read_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_read(td, uap->fd, &fp)) == 0) { + error = dofileread(td, fp, uap->fd, uap->buf, + uap->nbyte, (off_t)-1, 0); + fdrop(fp, td); + } + return(error); +} + +/* + * Pread system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct pread_args { + int fd; + void *buf; + size_t nbyte; + int pad; + off_t offset; +}; +#endif +/* + * MPSAFE + */ +int +pread(td, uap) + struct thread *td; + struct pread_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_read(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + error = ESPIPE; + } else { + error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET); + } + fdrop(fp, td); + return(error); +} + +/* + * Code common for read and pread + */ +int +dofileread(td, fp, fd, buf, nbyte, offset, flags) + struct thread *td; + struct file *fp; + int fd, flags; + void *buf; + size_t nbyte; + off_t offset; +{ + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; + struct uio ktruio; + int didktr = 0; +#endif + + aiov.iov_base = (caddr_t)buf; + aiov.iov_len = nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + if (nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = nbyte; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(td, KTR_GENIO)) { + ktriov = aiov; + ktruio = auio; + didktr = 1; + } +#endif + cnt = nbyte; + + if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (didktr && error == 0) { + ktruio.uio_iov = &ktriov; + ktruio.uio_resid = cnt; + ktrgenio(fd, UIO_READ, &ktruio, error); + } +#endif + td->td_retval[0] = cnt; + return (error); +} + +/* + * Scatter read system call. 
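+ *
+ * The iovec array is validated before use: more than UIO_MAXIOV entries
+ * yields EINVAL, as does a total length that would overflow INT_MAX,
+ * mirroring the single-buffer check in dofileread().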
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +/* + * MPSAFE + */ +int +readv(td, uap) + struct thread *td; + struct readv_args *uap; +{ + struct file *fp; + struct uio auio; + struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt; + int error; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; +#endif + + if ((error = fget_read(td, uap->fd, &fp)) != 0) + return (error); + needfree = NULL; + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) { + error = EINVAL; + goto done; + } + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else + iov = aiov; + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(td, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + ktruio = auio; + } +#endif + cnt = auio.uio_resid; + if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = cnt; + ktrgenio(uap->fd, UIO_READ, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + td->td_retval[0] = cnt; +done: + fdrop(fp, td); + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + const void *buf; + size_t nbyte; +}; +#endif +/* + * MPSAFE + */ +int +write(td, uap) + struct thread *td; + struct write_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_write(td, uap->fd, &fp)) == 0) { + error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, + (off_t)-1, 0); + fdrop(fp, td); + } else { + error = EBADF; /* XXX this can't be right */ + } + return(error); +} + +/* + * Pwrite system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct pwrite_args { + int fd; + const void *buf; + size_t nbyte; + int pad; + off_t offset; +}; +#endif +/* + * MPSAFE + */ +int +pwrite(td, uap) + struct thread *td; + struct pwrite_args *uap; +{ + struct file *fp; + int error; + + if ((error = fget_write(td, uap->fd, &fp)) == 0) { + if (fp->f_type == DTYPE_VNODE) { + error = dofilewrite(td, fp, uap->fd, uap->buf, + uap->nbyte, uap->offset, FOF_OFFSET); + } else { + error = ESPIPE; + } + fdrop(fp, td); + } else { + error = EBADF; /* this can't be right */ + } + return(error); +} + +static int +dofilewrite(td, fp, fd, buf, nbyte, offset, flags) + struct thread *td; + struct file *fp; + int fd, flags; + const void *buf; + size_t nbyte; + off_t offset; +{ + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; + struct uio ktruio; + int didktr = 0; +#endif + + aiov.iov_base = (void *)(uintptr_t)buf; + aiov.iov_len = nbyte; + auio.uio_iov = &aiov; + 
auio.uio_iovcnt = 1; + auio.uio_offset = offset; + if (nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = nbyte; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec and uio + */ + if (KTRPOINT(td, KTR_GENIO)) { + ktriov = aiov; + ktruio = auio; + didktr = 1; + } +#endif + cnt = nbyte; + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* Socket layer is responsible for issuing SIGPIPE. */ + if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (didktr && error == 0) { + ktruio.uio_iov = &ktriov; + ktruio.uio_resid = cnt; + ktrgenio(fd, UIO_WRITE, &ktruio, error); + } +#endif + td->td_retval[0] = cnt; + return (error); +} + +/* + * Gather write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +/* + * MPSAFE + */ +int +writev(td, uap) + struct thread *td; + register struct writev_args *uap; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; +#endif + + mtx_lock(&Giant); + if ((error = fget_write(td, uap->fd, &fp)) != 0) { + error = EBADF; + goto done2; + } + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) { + needfree = NULL; + error = EINVAL; + goto done; + } + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec and uio + */ + if (KTRPOINT(td, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + ktruio = auio; + } +#endif + cnt = auio.uio_resid; + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = cnt; + ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + td->td_retval[0] = cnt; +done: + fdrop(fp, td); + if (needfree) + FREE(needfree, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * Ioctl system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + u_long com; + caddr_t data; +}; +#endif +/* + * MPSAFE + */ +/* 
ARGSUSED */ +int +ioctl(td, uap) + struct thread *td; + register struct ioctl_args *uap; +{ + struct file *fp; + register struct filedesc *fdp; + register u_long com; + int error = 0; + register u_int size; + caddr_t data, memp; + int tmp; +#define STK_PARAMS 128 + union { + char stkbuf[STK_PARAMS]; + long align; + } ubuf; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + mtx_lock(&Giant); + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + fdrop(fp, td); + mtx_unlock(&Giant); + return (EBADF); + } + fdp = td->td_proc->p_fd; + switch (com = uap->com) { + case FIONCLEX: + FILEDESC_LOCK(fdp); + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + mtx_unlock(&Giant); + return (0); + case FIOCLEX: + FILEDESC_LOCK(fdp); + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + mtx_unlock(&Giant); + return (0); + } + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's address space. + */ + size = IOCPARM_LEN(com); + if (size > IOCPARM_MAX) { + fdrop(fp, td); + mtx_unlock(&Giant); + return (ENOTTY); + } + + memp = NULL; + if (size > sizeof (ubuf.stkbuf)) { + memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + data = memp; + } else { + data = ubuf.stkbuf; + } + if (com&IOC_IN) { + if (size) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (memp) + free(memp, M_IOCTLOPS); + fdrop(fp, td); + goto done; + } + } else { + *(caddr_t *)data = uap->data; + } + } else if ((com&IOC_OUT) && size) { + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + } else if (com&IOC_VOID) { + *(caddr_t *)data = uap->data; + } + + switch (com) { + + case FIONBIO: + FILE_LOCK(fp); + if ((tmp = *(int *)data)) + fp->f_flag |= FNONBLOCK; + else + fp->f_flag &= ~FNONBLOCK; + FILE_UNLOCK(fp); + error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); + break; + + case FIOASYNC: + FILE_LOCK(fp); + if ((tmp = *(int *)data)) + fp->f_flag |= FASYNC; + else + fp->f_flag &= ~FASYNC; + FILE_UNLOCK(fp); + error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); + break; + + default: + error = fo_ioctl(fp, com, data, td); + /* + * Copy any data to user, size was + * already set and checked above. + */ + if (error == 0 && (com&IOC_OUT) && size) + error = copyout(data, uap->data, (u_int)size); + break; + } + if (memp) + free(memp, M_IOCTLOPS); + fdrop(fp, td); +done: + mtx_unlock(&Giant); + return (error); +} + +/* + * sellock and selwait are initialized in selectinit() via SYSINIT. + */ +struct mtx sellock; +struct cv selwait; +u_int nselcoll; /* Select collisions since boot */ +SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); + +/* + * Select system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif +/* + * MPSAFE + */ +int +select(td, uap) + register struct thread *td; + register struct select_args *uap; +{ + struct filedesc *fdp; + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. 
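+	 * Spelled out with the sizes used here (s_selbits holds 2048 bits,
+	 * i.e. 256 bytes): for nd = 1024 with a single non-null set,
+	 * ncpbytes = 128 and nbufbytes = 2 * 128 = 256, which exactly
+	 * fits; for nd = 256 with all three sets, nbufbytes = 3 * 2 * 32 =
+	 * 192.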
+ */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; + struct timeval atv, rtv, ttv; + int error, timo; + u_int ncoll, nbufbytes, ncpbytes, nfdbits; + + if (uap->nd < 0) + return (EINVAL); + fdp = td->td_proc->p_fd; + mtx_lock(&Giant); + FILEDESC_LOCK(fdp); + + if (uap->nd > td->td_proc->p_fd->fd_nfiles) + uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + FILEDESC_UNLOCK(fdp); + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. + */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. + */ + sbp = selbits; +#define getbits(name, x) \ + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done_nosellock; \ + } \ + } while (0) + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); + + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done_nosellock; + if (itimerfix(&atv)) { + error = EINVAL; + goto done_nosellock; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + timo = 0; + mtx_lock(&sellock); +retry: + ncoll = nselcoll; + mtx_lock_spin(&sched_lock); + td->td_flags |= TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + + /* XXX Is there a better place for this? */ + TAILQ_INIT(&td->td_selq); + error = selscan(td, ibits, obits, uap->nd); + mtx_lock(&sellock); + if (error || td->td_retval[0]) + goto done; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + + /* + * An event of interest may occur while we do not hold + * sellock, so check TDF_SELECT and the number of + * collisions and rescan the file descriptors if + * necessary. + */ + mtx_lock_spin(&sched_lock); + if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { + mtx_unlock_spin(&sched_lock); + goto retry; + } + mtx_unlock_spin(&sched_lock); + + if (timo > 0) + error = cv_timedwait_sig(&selwait, &sellock, timo); + else + error = cv_wait_sig(&selwait, &sellock); + + if (error == 0) + goto retry; + +done: + clear_selinfo_list(td); + mtx_lock_spin(&sched_lock); + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + +done_nosellock: + /* select is not restarted after signals... 
*/ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); + + mtx_unlock(&Giant); + return (error); +} + +static int +selscan(td, ibits, obits, nfd) + struct thread *td; + fd_mask **ibits, **obits; + int nfd; +{ + int msk, i, fd; + fd_mask bits; + struct file *fp; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + struct filedesc *fdp = td->td_proc->p_fd; + + FILEDESC_LOCK(fdp); + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk][i/NFDBITS]; + /* ffs(int mask) not portable, fd_mask is long */ + for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { + if (!(bits & 1)) + continue; + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_UNLOCK(fdp); + return (EBADF); + } + if (fo_poll(fp, flag[msk], fp->f_cred, td)) { + obits[msk][(fd)/NFDBITS] |= + ((fd_mask)1 << ((fd) % NFDBITS)); + n++; + } + } + } + } + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = n; + return (0); +} + +/* + * Poll system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +/* + * MPSAFE + */ +int +poll(td, uap) + struct thread *td; + struct poll_args *uap; +{ + caddr_t bits; + char smallbits[32 * sizeof(struct pollfd)]; + struct timeval atv, rtv, ttv; + int error = 0, timo; + u_int ncoll, nfds; + size_t ni; + + nfds = SCARG(uap, nfds); + + mtx_lock(&Giant); + /* + * This is kinda bogus. We have fd limits, but that is not + * really related to the size of the pollfd array. Make sure + * we let the process use at least FD_SETSIZE entries and at + * least enough for the current limits. We want to be reasonably + * safe, but not overly restrictive. + */ + if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && + (nfds > FD_SETSIZE)) { + error = EINVAL; + goto done2; + } + ni = nfds * sizeof(struct pollfd); + if (ni > sizeof(smallbits)) + bits = malloc(ni, M_TEMP, M_WAITOK); + else + bits = smallbits; + error = copyin(SCARG(uap, fds), bits, ni); + if (error) + goto done_nosellock; + if (SCARG(uap, timeout) != INFTIM) { + atv.tv_sec = SCARG(uap, timeout) / 1000; + atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; + if (itimerfix(&atv)) { + error = EINVAL; + goto done_nosellock; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + timo = 0; + mtx_lock(&sellock); +retry: + ncoll = nselcoll; + mtx_lock_spin(&sched_lock); + td->td_flags |= TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + + /* XXX Is there a better place for this? */ + TAILQ_INIT(&td->td_selq); + error = pollscan(td, (struct pollfd *)bits, nfds); + mtx_lock(&sellock); + if (error || td->td_retval[0]) + goto done; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + /* + * An event of interest may occur while we do not hold + * sellock, so check TDF_SELECT and the number of collisions + * and rescan the file descriptors if necessary. 
+ */ + mtx_lock_spin(&sched_lock); + if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { + mtx_unlock_spin(&sched_lock); + goto retry; + } + mtx_unlock_spin(&sched_lock); + + if (timo > 0) + error = cv_timedwait_sig(&selwait, &sellock, timo); + else + error = cv_wait_sig(&selwait, &sellock); + + if (error == 0) + goto retry; + +done: + clear_selinfo_list(td); + mtx_lock_spin(&sched_lock); + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); + +done_nosellock: + /* poll is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + if (error == 0) { + error = copyout(bits, SCARG(uap, fds), ni); + if (error) + goto out; + } +out: + if (ni > sizeof(smallbits)) + free(bits, M_TEMP); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +pollscan(td, fds, nfd) + struct thread *td; + struct pollfd *fds; + u_int nfd; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int i; + struct file *fp; + int n = 0; + + FILEDESC_LOCK(fdp); + for (i = 0; i < nfd; i++, fds++) { + if (fds->fd >= fdp->fd_nfiles) { + fds->revents = POLLNVAL; + n++; + } else if (fds->fd < 0) { + fds->revents = 0; + } else { + fp = fdp->fd_ofiles[fds->fd]; + if (fp == NULL) { + fds->revents = POLLNVAL; + n++; + } else { + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fds->revents = fo_poll(fp, fds->events, + fp->f_cred, td); + if (fds->revents != 0) + n++; + } + } + } + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = n; + return (0); +} + +/* + * OpenBSD poll system call. + * XXX this isn't quite a true representation.. OpenBSD uses select ops. + */ +#ifndef _SYS_SYSPROTO_H_ +struct openbsd_poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +/* + * MPSAFE + */ +int +openbsd_poll(td, uap) + register struct thread *td; + register struct openbsd_poll_args *uap; +{ + return (poll(td, (struct poll_args *)uap)); +} + +/* + * Remove the references to the thread from all of the objects + * we were polling. + * + * This code assumes that the underlying owner of the selinfo + * structure will hold sellock before it changes it, and that + * it will unlink itself from our list if it goes away. + */ +void +clear_selinfo_list(td) + struct thread *td; +{ + struct selinfo *si; + + mtx_assert(&sellock, MA_OWNED); + TAILQ_FOREACH(si, &td->td_selq, si_thrlist) + si->si_thread = NULL; + TAILQ_INIT(&td->td_selq); +} + +/*ARGSUSED*/ +int +seltrue(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + + return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Record a select request. + */ +void +selrecord(selector, sip) + struct thread *selector; + struct selinfo *sip; +{ + + mtx_lock(&sellock); + /* + * If the thread is NULL then take ownership of selinfo + * however if the thread is not NULL and the thread points to + * someone else, then we have a collision, otherwise leave it alone + * as we've owned it in a previous selrecord on this selinfo. + */ + if (sip->si_thread == NULL) { + sip->si_thread = selector; + TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); + } else if (sip->si_thread != selector) { + sip->si_flags |= SI_COLL; + } + + mtx_unlock(&sellock); +} + +/* + * Do a wakeup when a selectable event occurs. 
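+ *
+ * Typical (hypothetical) usage: a driver's poll routine calls
+ * selrecord(td, &sc->sc_rsel) when no data is ready, and its interrupt
+ * handler later calls selwakeup(&sc->sc_rsel) when data arrives, waking
+ * any thread blocked in select() or poll() on that descriptor.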
+ */ +void +selwakeup(sip) + struct selinfo *sip; +{ + struct thread *td; + + mtx_lock(&sellock); + td = sip->si_thread; + if ((sip->si_flags & SI_COLL) != 0) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + cv_broadcast(&selwait); + } + if (td == NULL) { + mtx_unlock(&sellock); + return; + } + TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); + sip->si_thread = NULL; + mtx_lock_spin(&sched_lock); + if (td->td_wchan == (caddr_t)&selwait) { + if (td->td_proc->p_stat == SSLEEP) + setrunnable(td); + else + cv_waitq_remove(td); + } else + td->td_flags &= ~TDF_SELECT; + mtx_unlock_spin(&sched_lock); + mtx_unlock(&sellock); +} + +static void selectinit(void *); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) + +/* ARGSUSED*/ +static void +selectinit(dummy) + void *dummy; +{ + cv_init(&selwait, "select"); + mtx_init(&sellock, "sellck", NULL, MTX_DEF); +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..11ab6d1 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1427 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $FreeBSD$ + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. 
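+ *
+ * In other words, a single write(2) of at least PIPE_MINDIRECT bytes on
+ * a blocking descriptor is a candidate for the direct path (subject to
+ * the pipe kva limits defined below), while smaller writes and
+ * non-blocking writes always go through the kernel buffer.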
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/signalvar.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/event.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int pipe_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int pipe_close(struct file *fp, struct thread *td); +static int pipe_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int pipe_kqfilter(struct file *fp, struct knote *kn); +static int pipe_stat(struct file *fp, struct stat *sb, struct thread *td); +static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td); + +static struct fileops pipeops = { + pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, + pipe_stat, pipe_close +}; + +static void filt_pipedetach(struct knote *kn); +static int filt_piperead(struct knote *kn, long hint); +static int filt_pipewrite(struct knote *kn, long hint); + +static struct filterops pipe_rfiltops = + { 1, NULL, filt_pipedetach, filt_piperead }; +static struct filterops pipe_wfiltops = + { 1, NULL, filt_pipedetach, filt_pipewrite }; + +#define PIPE_GET_GIANT(pipe) \ + do { \ + KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0, \ + ("%s:%d PIPE_GET_GIANT: line pipe not locked", \ + __FILE__, __LINE__)); \ + PIPE_UNLOCK(pipe); \ + mtx_lock(&Giant); \ + } while (0) + +#define PIPE_DROP_GIANT(pipe) \ + do { \ + mtx_unlock(&Giant); \ + PIPE_LOCK(pipe); \ + } while (0) + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. 
+ */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +static int nbigpipe; + +static int amountpipekva; + +static void pipeinit(void *dummy __unused); +static void pipeclose(struct pipe *cpipe); +static void pipe_free_kmem(struct pipe *cpipe); +static int pipe_create(struct pipe **cpipep); +static __inline int pipelock(struct pipe *cpipe, int catch); +static __inline void pipeunlock(struct pipe *cpipe); +static __inline void pipeselwakeup(struct pipe *cpipe); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); +static void pipe_destroy_write_buffer(struct pipe *wpipe); +static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); +static void pipe_clone_write_buffer(struct pipe *wpipe); +#endif +static int pipespace(struct pipe *cpipe, int size); + +static uma_zone_t pipe_zone; + +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); + +static void +pipeinit(void *dummy __unused) +{ + pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, 0); +} + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(td, uap) + struct thread *td; + struct pipe_args /* { + int dummy; + } */ *uap; +{ + struct filedesc *fdp = td->td_proc->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + struct mtx *pmtx; + int fd, error; + + KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); + + pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO); + + rpipe = wpipe = NULL; + if (pipe_create(&rpipe) || pipe_create(&wpipe)) { + pipeclose(rpipe); + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (ENFILE); + } + + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(td, &rf, &fd); + if (error) { + pipeclose(rpipe); + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (error); + } + fhold(rf); + td->td_retval[0] = fd; + + /* + * Warning: once we've gotten past allocation of the fd for the + * read-side, we can only drop the read side via fdrop() in order + * to avoid races against processes which manage to dup() the read + * side while we are blocked trying to allocate the write side. + */ + FILE_LOCK(rf); + rf->f_flag = FREAD | FWRITE; + rf->f_type = DTYPE_PIPE; + rf->f_data = (caddr_t)rpipe; + rf->f_ops = &pipeops; + FILE_UNLOCK(rf); + error = falloc(td, &wf, &fd); + if (error) { + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[td->td_retval[0]] == rf) { + fdp->fd_ofiles[td->td_retval[0]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(rf, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(rf, td); + /* rpipe has been closed by fdrop(). */ + pipeclose(wpipe); + free(pmtx, M_TEMP); + return (error); + } + FILE_LOCK(wf); + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_data = (caddr_t)wpipe; + wf->f_ops = &pipeops; + FILE_UNLOCK(wf); + td->td_retval[1] = fd; + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); + rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; + fdrop(rf, td); + + return (0); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + * This routine will 'realloc' the size of a pipe safely, if it fails + * it will retain the old buffer. + * If it fails it will return ENOMEM. 
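+ * On success the old buffer (if any) is released via pipe_free_kmem(),
+ * the new object is mapped pageable into kernel_map, and amountpipekva
+ * is adjusted to account for the new size.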
+ */ +static int +pipespace(cpipe, size) + struct pipe *cpipe; + int size; +{ + struct vm_object *object; + caddr_t buffer; + int npages, error; + + GIANT_REQUIRED; + KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), + ("pipespace: pipe mutex locked")); + + npages = round_page(size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + object = vm_object_allocate(OBJT_DEFAULT, npages); + buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + error = vm_map_find(kernel_map, object, 0, + (vm_offset_t *) &buffer, size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) { + vm_object_deallocate(object); + return (ENOMEM); + } + + /* free old resources if we're resizing */ + pipe_free_kmem(cpipe); + cpipe->pipe_buffer.object = object; + cpipe->pipe_buffer.buffer = buffer; + cpipe->pipe_buffer.size = size; + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + amountpipekva += cpipe->pipe_buffer.size; + return (0); +} + +/* + * initialize and allocate VM and memory for pipe + */ +static int +pipe_create(cpipep) + struct pipe **cpipep; +{ + struct pipe *cpipe; + int error; + + *cpipep = uma_zalloc(pipe_zone, M_WAITOK); + if (*cpipep == NULL) + return (ENOMEM); + + cpipe = *cpipep; + + /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ + cpipe->pipe_buffer.object = NULL; +#ifndef PIPE_NODIRECT + cpipe->pipe_map.kva = NULL; +#endif + /* + * protect so pipeclose() doesn't follow a junk pointer + * if pipespace() fails. + */ + bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif + + cpipe->pipe_mtxp = NULL; /* avoid pipespace assertion */ + error = pipespace(cpipe, PIPE_SIZE); + if (error) + return (error); + + vfs_timestamp(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + + return (0); +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + while (cpipe->pipe_state & PIPE_LOCKFL) { + cpipe->pipe_state |= PIPE_LWANT; + error = msleep(cpipe, PIPE_MTX(cpipe), + catch ? 
(PRIBIO | PCATCH) : PRIBIO, + "pipelk", 0); + if (error != 0) + return (error); + } + cpipe->pipe_state |= PIPE_LOCKFL; + return (0); +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + + PIPE_LOCK_ASSERT(cpipe, MA_OWNED); + cpipe->pipe_state &= ~PIPE_LOCKFL; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + + if (cpipe->pipe_state & PIPE_SEL) { + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) + pgsigio(&cpipe->pipe_sigio, SIGIO, 0); + KNOTE(&cpipe->pipe_sel.si_note, 0); +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error; + int nread = 0; + u_int size; + + PIPE_LOCK(rpipe); + ++rpipe->pipe_busy; + error = pipelock(rpipe, 1); + if (error) + goto unlocked_error; + + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + PIPE_UNLOCK(rpipe); + error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + + va = (caddr_t) rpipe->pipe_map.kva + + rpipe->pipe_map.pos; + PIPE_UNLOCK(rpipe); + error = uiomove(va, size, uio); + PIPE_LOCK(rpipe); + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + * read returns 0 on EOF, no need to set error + */ + if (rpipe->pipe_state & PIPE_EOF) + break; + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + /* + * Break if some data was read. + */ + if (nread > 0) + break; + + /* + * Unlock the pipe buffer for our remaining processing. We + * will either break out with an error or we will sleep and + * relock to loop. + */ + pipeunlock(rpipe); + + /* + * Handle non-blocking mode operation or + * wait for more data. + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + } else { + rpipe->pipe_state |= PIPE_WANTR; + if ((error = msleep(rpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, + "piperd", 0)) == 0) + error = pipelock(rpipe, 1); + } + if (error) + goto unlocked_error; + } + } + pipeunlock(rpipe); + + /* XXX: should probably do this before getting any locks. 
*/ + if (error == 0) + vfs_timestamp(&rpipe->pipe_atime); +unlocked_error: + --rpipe->pipe_busy; + + /* + * PIPE_WANT processing only makes sense if pipe_busy is 0. + */ + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * Handle write blocking hysteresis. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + PIPE_UNLOCK(rpipe); + return (error); +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. + */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + GIANT_REQUIRED; + PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); + addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); + for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { + vm_page_t m; + + if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || + (paddr = pmap_extract(vmspace_pmap(curproc->p_vmspace), + addr)) == 0) { + int j; + + for (j = 0; j < i; j++) + vm_page_unwire(wpipe->pipe_map.ms[j], 1); + return (EFAULT); + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = + ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. + */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return (0); +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) + struct pipe *wpipe; +{ + int i; + + GIANT_REQUIRED; + PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); + + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i = 0; i < wpipe->pipe_map.npages; i++) + vm_page_unwire(wpipe->pipe_map.ms[i], 1); + wpipe->pipe_map.npages = 0; +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. 
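+ * The cloned data is left in the normal circular buffer (in = cnt,
+ * out = 0), PIPE_DIRECTW is cleared, and the sender's wired mapping is
+ * torn down via pipe_destroy_write_buffer(), so the reader then
+ * proceeds exactly as for an ordinary buffered write.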
+ */ +static void +pipe_clone_write_buffer(wpipe) + struct pipe *wpipe; +{ + int size; + int pos; + + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + PIPE_GET_GIANT(wpipe); + bcopy((caddr_t) wpipe->pipe_map.kva + pos, + (caddr_t) wpipe->pipe_buffer.buffer, size); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. + */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; + +retry: + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(wpipe), + PRIBIO | PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + pipelock(wpipe, 0); + PIPE_GET_GIANT(wpipe); + error = pipe_build_write_buffer(wpipe, uio); + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + PIPE_GET_GIANT(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, + "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + PIPE_GET_GIANT(wpipe); + pipe_destroy_write_buffer(wpipe); + PIPE_DROP_GIANT(wpipe); + } + pipeunlock(wpipe); + return (error); + +error1: + wakeup(wpipe); + return (error); +} +#endif + +static int +pipe_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + int error = 0; + int orig_resid; + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + PIPE_UNLOCK(rpipe); + return (EPIPE); + } + ++wpipe->pipe_busy; + + /* + * If it is advantageous to resize the pipe buffer, do + * so. 
+ */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if ((error = pipelock(wpipe,1)) == 0) { + PIPE_GET_GIANT(wpipe); + if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) + nbigpipe++; + PIPE_DROP_GIANT(wpipe); + pipeunlock(wpipe); + } + } + + /* + * If an early error occured unbusy and return, waking up any pending + * readers. + */ + if (error) { + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } + PIPE_UNLOCK(rpipe); + return(error); + } + + KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); + + orig_resid = uio->uio_resid; + + while (uio->uio_resid) { + int space; + +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + * + * The direct write mechanism will detect the reader going + * away on us. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) + break; + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. We break out if a signal occurs or the + * reader goes away. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, + "pipbww", 0); + if (wpipe->pipe_state & PIPE_EOF) + break; + if (error) + break; + } + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + if ((error = pipelock(wpipe,1)) == 0) { + int size; /* Transfer size */ + int segsize; /* first segment to transfer */ + + /* + * It is possible for a direct write to + * slip in on us... handle it here... + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + /* + * If a process blocked in uiomove, our + * value for space might be bad. + * + * XXX will we be ok if the reader has gone + * away here? + */ + if (space > wpipe->pipe_buffer.size - + wpipe->pipe_buffer.cnt) { + pipeunlock(wpipe); + goto retrywrite; + } + + /* + * Transfer size is minimum of uio transfer + * and free space in pipe buffer. + */ + if (space > uio->uio_resid) + size = uio->uio_resid; + else + size = space; + /* + * First segment to transfer is minimum of + * transfer size and contiguous space in + * pipe buffer. If first segment to transfer + * is less than the transfer size, we've got + * a wraparound in the buffer. 
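+					 * For example, with a 16K buffer, in == 15K and a 4K
+					 * transfer, segsize is 1K: the first uiomove() below
+					 * fills the tail of the buffer and the second copies
+					 * the remaining 3K to the start, leaving "in" at 3K.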
+ */ + segsize = wpipe->pipe_buffer.size - + wpipe->pipe_buffer.in; + if (segsize > size) + segsize = size; + + /* Transfer first segment */ + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + segsize, uio); + PIPE_LOCK(rpipe); + + if (error == 0 && segsize < size) { + /* + * Transfer remaining part now, to + * support atomic writes. Wraparound + * happened. + */ + if (wpipe->pipe_buffer.in + segsize != + wpipe->pipe_buffer.size) + panic("Expected pipe buffer wraparound disappeared"); + + PIPE_UNLOCK(rpipe); + error = uiomove(&wpipe->pipe_buffer.buffer[0], + size - segsize, uio); + PIPE_LOCK(rpipe); + } + if (error == 0) { + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= + wpipe->pipe_buffer.size) { + if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) + panic("Expected wraparound bad"); + wpipe->pipe_buffer.in = size - segsize; + } + + wpipe->pipe_buffer.cnt += size; + if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) + panic("Pipe buffer overflow"); + + } + pipeunlock(wpipe); + } + if (error) + break; + + } else { + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + error = msleep(wpipe, PIPE_MTX(rpipe), + PRIBIO | PCATCH, "pipewr", 0); + if (error != 0) + break; + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + + if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) { + error = 0; + } + + if (error == 0) + vfs_timestamp(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select/poll. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + PIPE_UNLOCK(rpipe); + return (error); +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. + */ +int +pipe_ioctl(fp, cmd, data, td) + struct file *fp; + u_long cmd; + caddr_t data; + struct thread *td; +{ + struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + PIPE_LOCK(mpipe); + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + PIPE_UNLOCK(mpipe); + return (0); + + case FIONREAD: + PIPE_LOCK(mpipe); + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + PIPE_UNLOCK(mpipe); + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &mpipe->pipe_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(mpipe->pipe_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. 
*/ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(mpipe->pipe_sigio); + return (0); + + } + return (ENOTTY); +} + +int +pipe_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + int revents = 0; + + wpipe = rpipe->pipe_peer; + PIPE_LOCK(rpipe); + if (events & (POLLIN | POLLRDNORM)) + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) + revents |= events & (POLLOUT | POLLWRNORM); + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) { + selrecord(td, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + } + } + PIPE_UNLOCK(rpipe); + + return (revents); +} + +/* + * We shouldn't need locks here as we're doing a read and this should + * be a natural race. + */ +static int +pipe_stat(fp, ub, td) + struct file *fp; + struct stat *ub; + struct thread *td; +{ + struct pipe *pipe = (struct pipe *)fp->f_data; + + bzero((caddr_t)ub, sizeof(*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_atimespec = pipe->pipe_atime; + ub->st_mtimespec = pipe->pipe_mtime; + ub->st_ctimespec = pipe->pipe_ctime; + ub->st_uid = fp->f_cred->cr_uid; + ub->st_gid = fp->f_cred->cr_gid; + /* + * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return (0); +} + +/* ARGSUSED */ +static int +pipe_close(fp, td) + struct file *fp; + struct thread *td; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + fp->f_ops = &badfileops; + fp->f_data = NULL; + funsetown(&cpipe->pipe_sigio); + pipeclose(cpipe); + return (0); +} + +static void +pipe_free_kmem(cpipe) + struct pipe *cpipe; +{ + + GIANT_REQUIRED; + KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), + ("pipespace: pipe mutex locked")); + + if (cpipe->pipe_buffer.buffer != NULL) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + cpipe->pipe_buffer.buffer = NULL; + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva != NULL) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + } +#endif +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + int hadpeer; + + if (cpipe == NULL) + return; + + hadpeer = 0; + + /* partially created pipes won't have a valid mutex. 
*/ + if (PIPE_MTX(cpipe) != NULL) + PIPE_LOCK(cpipe); + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; + msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if ((ppipe = cpipe->pipe_peer) != NULL) { + hadpeer++; + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + KNOTE(&ppipe->pipe_sel.si_note, 0); + ppipe->pipe_peer = NULL; + } + /* + * free resources + */ + if (PIPE_MTX(cpipe) != NULL) { + PIPE_UNLOCK(cpipe); + if (!hadpeer) { + mtx_destroy(PIPE_MTX(cpipe)); + free(PIPE_MTX(cpipe), M_TEMP); + } + } + mtx_lock(&Giant); + pipe_free_kmem(cpipe); + uma_zfree(pipe_zone, cpipe); + mtx_unlock(&Giant); +} + +/*ARGSUSED*/ +static int +pipe_kqfilter(struct file *fp, struct knote *kn) +{ + struct pipe *cpipe; + + cpipe = (struct pipe *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &pipe_rfiltops; + break; + case EVFILT_WRITE: + kn->kn_fop = &pipe_wfiltops; + cpipe = cpipe->pipe_peer; + break; + default: + return (1); + } + kn->kn_hook = (caddr_t)cpipe; + + PIPE_LOCK(cpipe); + SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); + PIPE_UNLOCK(cpipe); + return (0); +} + +static void +filt_pipedetach(struct knote *kn) +{ + struct pipe *cpipe = (struct pipe *)kn->kn_hook; + + PIPE_LOCK(cpipe); + SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); + PIPE_UNLOCK(cpipe); +} + +/*ARGSUSED*/ +static int +filt_piperead(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + kn->kn_data = rpipe->pipe_buffer.cnt; + if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + kn->kn_data = rpipe->pipe_map.cnt; + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + PIPE_UNLOCK(rpipe); + return (kn->kn_data > 0); +} + +/*ARGSUSED*/ +static int +filt_pipewrite(struct knote *kn, long hint) +{ + struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; + struct pipe *wpipe = rpipe->pipe_peer; + + PIPE_LOCK(rpipe); + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + kn->kn_data = 0; + kn->kn_flags |= EV_EOF; + PIPE_UNLOCK(rpipe); + return (1); + } + kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + if (wpipe->pipe_state & PIPE_DIRECTW) + kn->kn_data = 0; + + PIPE_UNLOCK(rpipe); + return (kn->kn_data >= PIPE_BUF); +} diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c new file mode 100644 index 0000000..dacb9d9 --- /dev/null +++ b/sys/kern/sys_process.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> +#include <sys/sx.h> +#include <sys/user.h> + +#include <machine/reg.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +/* + * Functions implemented using PROC_ACTION(): + * + * proc_read_regs(proc, regs) + * Get the current user-visible register set from the process + * and copy it into the regs structure (<machine/reg.h>). + * The process is stopped at the time read_regs is called. + * + * proc_write_regs(proc, regs) + * Update the current register set from the passed in regs + * structure. Take care to avoid clobbering special CPU + * registers or privileged bits in the PSL. + * Depending on the architecture this may have fix-up work to do, + * especially if the IAR or PCW are modified. + * The process is stopped at the time write_regs is called. + * + * proc_read_fpregs, proc_write_fpregs + * deal with the floating point register set, otherwise as above. + * + * proc_read_dbregs, proc_write_dbregs + * deal with the processor debug register set, otherwise as above. + * + * proc_sstep(proc) + * Arrange for the process to trap after executing a single instruction. + */ + +#define PROC_ACTION(action) do { \ + int error; \ + \ + mtx_lock_spin(&sched_lock); \ + if ((td->td_proc->p_sflag & PS_INMEM) == 0) \ + error = EIO; \ + else \ + error = (action); \ + mtx_unlock_spin(&sched_lock); \ + return (error); \ +} while(0) + +int +proc_read_regs(struct thread *td, struct reg *regs) +{ + + PROC_ACTION(fill_regs(td, regs)); +} + +int +proc_write_regs(struct thread *td, struct reg *regs) +{ + + PROC_ACTION(set_regs(td, regs)); +} + +int +proc_read_dbregs(struct thread *td, struct dbreg *dbregs) +{ + + PROC_ACTION(fill_dbregs(td, dbregs)); +} + +int +proc_write_dbregs(struct thread *td, struct dbreg *dbregs) +{ + + PROC_ACTION(set_dbregs(td, dbregs)); +} + +/* + * Ptrace doesn't support fpregs at all, and there are no security holes + * or translations for fpregs, so we can just copy them. 
+ */ +int +proc_read_fpregs(struct thread *td, struct fpreg *fpregs) +{ + + PROC_ACTION(fill_fpregs(td, fpregs)); +} + +int +proc_write_fpregs(struct thread *td, struct fpreg *fpregs) +{ + + PROC_ACTION(set_fpregs(td, fpregs)); +} + +int +proc_sstep(struct thread *td) +{ + + PROC_ACTION(ptrace_single_step(td)); +} + +int +proc_rwmem(struct proc *p, struct uio *uio) +{ + struct vmspace *vm; + vm_map_t map; + vm_object_t object = NULL; + vm_offset_t pageno = 0; /* page number */ + vm_prot_t reqprot; + vm_offset_t kva; + int error, writing; + + GIANT_REQUIRED; + + /* + * if the vmspace is in the midst of being deallocated or the + * process is exiting, don't try to grab anything. The page table + * usage in that process can be messed up. + */ + vm = p->p_vmspace; + if ((p->p_flag & P_WEXIT)) + return (EFAULT); + if (vm->vm_refcnt < 1) + return (EFAULT); + ++vm->vm_refcnt; + /* + * The map we want... + */ + map = &vm->vm_map; + + writing = uio->uio_rw == UIO_WRITE; + reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : + VM_PROT_READ; + + kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + + /* + * Only map in one page at a time. We don't have to, but it + * makes things easier. This way is trivial - right? + */ + do { + vm_map_t tmap; + vm_offset_t uva; + int page_offset; /* offset into page */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + u_int len; + vm_page_t m; + + object = NULL; + + uva = (vm_offset_t)uio->uio_offset; + + /* + * Get the page number of this segment. + */ + pageno = trunc_page(uva); + page_offset = uva - pageno; + + /* + * How many bytes to copy + */ + len = min(PAGE_SIZE - page_offset, uio->uio_resid); + + /* + * Fault the page on behalf of the process + */ + error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL); + if (error) { + error = EFAULT; + break; + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, + * and single_use aren't used. One would think the vm code + * would be a *bit* nicer... We use tmap because + * vm_map_lookup() can change the map argument. + */ + tmap = map; + error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, + &object, &pindex, &out_prot, &wired); + + if (error) { + error = EFAULT; + + /* + * Make sure that there is no residue in 'object' from + * an error return on vm_map_lookup. + */ + object = NULL; + + break; + } + + m = vm_page_lookup(object, pindex); + + /* Allow fallback to backing objects if we are reading */ + + while (m == NULL && !writing && object->backing_object) { + + pindex += OFF_TO_IDX(object->backing_object_offset); + object = object->backing_object; + + m = vm_page_lookup(object, pindex); + } + + if (m == NULL) { + error = EFAULT; + + /* + * Make sure that there is no residue in 'object' from + * an error return on vm_map_lookup. + */ + object = NULL; + + vm_map_lookup_done(tmap, out_entry); + + break; + } + + /* + * Wire the page into memory + */ + vm_page_wire(m); + + /* + * We're done with tmap now. + * But reference the object first, so that we won't loose + * it. + */ + vm_object_reference(object); + vm_map_lookup_done(tmap, out_entry); + + pmap_qenter(kva, &m, 1); + + /* + * Now do the i/o move. 
+ */ + error = uiomove((caddr_t)(kva + page_offset), len, uio); + + pmap_qremove(kva, 1); + + /* + * release the page and the object + */ + vm_page_unwire(m, 1); + vm_object_deallocate(object); + + object = NULL; + + } while (error == 0 && uio->uio_resid > 0); + + if (object) + vm_object_deallocate(object); + + kmem_free(kernel_map, kva, PAGE_SIZE); + vmspace_free(vm); + return (error); +} + +/* + * Process debugging system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ptrace_args { + int req; + pid_t pid; + caddr_t addr; + int data; +}; +#endif + +int +ptrace(struct thread *td, struct ptrace_args *uap) +{ + struct iovec iov; + struct uio uio; + /* + * XXX this obfuscation is to reduce stack usage, but the register + * structs may be too large to put on the stack anyway. + */ + union { + struct ptrace_io_desc piod; + struct dbreg dbreg; + struct fpreg fpreg; + struct reg reg; + } r; + struct proc *curp, *p, *pp; + struct thread *td2; + int error, write; + int proctree_locked = 0; + + curp = td->td_proc; + + /* + * Do copyin() early before getting locks and lock proctree before + * locking the process. + */ + switch (uap->req) { + case PT_TRACE_ME: + case PT_ATTACH: + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + sx_xlock(&proctree_lock); + proctree_locked = 1; + break; +#ifdef PT_SETREGS + case PT_SETREGS: + error = copyin(uap->addr, &r.reg, sizeof r.reg); + if (error) + return (error); + break; +#endif /* PT_SETREGS */ +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + error = copyin(uap->addr, &r.fpreg, sizeof r.fpreg); + if (error) + return (error); + break; +#endif /* PT_SETFPREGS */ +#ifdef PT_SETDBREGS + case PT_SETDBREGS: + error = copyin(uap->addr, &r.dbreg, sizeof r.dbreg); + if (error) + return (error); + break; +#endif /* PT_SETDBREGS */ + default: + break; + } + + write = 0; + if (uap->req == PT_TRACE_ME) { + p = td->td_proc; + PROC_LOCK(p); + } else { + if ((p = pfind(uap->pid)) == NULL) { + if (proctree_locked) + sx_xunlock(&proctree_lock); + return (ESRCH); + } + } + if (p_cansee(td, p)) { + error = ESRCH; + goto fail; + } + + if ((error = p_candebug(td, p)) != 0) + goto fail; + + /* + * System processes can't be debugged. + */ + if ((p->p_flag & P_SYSTEM) != 0) { + error = EINVAL; + goto fail; + } + + /* + * Permissions check + */ + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. */ + break; + + case PT_ATTACH: + /* Self */ + if (p->p_pid == td->td_proc->p_pid) { + error = EINVAL; + goto fail; + } + + /* Already traced */ + if (p->p_flag & P_TRACED) { + error = EBUSY; + goto fail; + } + + /* Can't trace an ancestor if you're being traced. */ + if (curp->p_flag & P_TRACED) { + for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { + if (pp == p) { + error = EINVAL; + goto fail; + } + } + } + + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_IO: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: + case PT_GETREGS: + case PT_SETREGS: + case PT_GETFPREGS: + case PT_SETFPREGS: + case PT_GETDBREGS: + case PT_SETDBREGS: + /* not being traced... 
*/ + if ((p->p_flag & P_TRACED) == 0) { + error = EPERM; + goto fail; + } + + /* not being traced by YOU */ + if (p->p_pptr != td->td_proc) { + error = EBUSY; + goto fail; + } + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) { + error = EBUSY; + goto fail; + } + + /* OK */ + break; + + default: + error = EINVAL; + goto fail; + } + + td2 = FIRST_THREAD_IN_PROC(p); +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(td2); /* XXXKSE */ +#endif + + /* + * Actually do the requests + */ + + td->td_retval[0] = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + return (0); + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != td->td_proc) + proc_reparent(p, td->td_proc); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + /* XXX uap->data is used even in the PT_STEP case. */ + if (uap->req != PT_STEP && (unsigned)uap->data > _SIG_MAXSIG) { + error = EINVAL; + goto fail; + } + + _PHOLD(p); + + if (uap->req == PT_STEP) { + error = ptrace_single_step(td2); + if (error) { + _PRELE(p); + goto fail; + } + } + + if (uap->addr != (caddr_t)1) { + fill_kinfo_proc(p, &p->p_uarea->u_kproc); + error = ptrace_set_pc(td2, + (u_long)(uintfptr_t)uap->addr); + if (error) { + _PRELE(p); + goto fail; + } + } + _PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + PROC_UNLOCK(p); + pp = pfind(p->p_oppid); + if (pp == NULL) + pp = initproc; + else + PROC_UNLOCK(pp); + PROC_LOCK(p); + proc_reparent(p, pp); + } + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + } + + sendsig: + if (proctree_locked) + sx_xunlock(&proctree_lock); + /* deliver or queue signal */ + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + mtx_lock_spin(&sched_lock); + setrunnable(td2); /* XXXKSE */ + mtx_unlock_spin(&sched_lock); + } else if (uap->data) + psignal(p, uap->data); + PROC_UNLOCK(p); + + return (0); + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + PROC_UNLOCK(p); + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : + (caddr_t)td->td_retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */ + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_td = td; + error = proc_rwmem(p, &uio); + if (uio.uio_resid != 0) { + /* + * XXX proc_rwmem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX proc_rwmem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? 
+ */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_IO: + error = copyin(uap->addr, &r.piod, sizeof r.piod); + if (error) + return (error); + iov.iov_base = r.piod.piod_addr; + iov.iov_len = r.piod.piod_len; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)r.piod.piod_offs; + uio.uio_resid = r.piod.piod_len; + uio.uio_segflg = UIO_USERSPACE; + uio.uio_td = td; + switch (r.piod.piod_op) { + case PIOD_READ_D: + case PIOD_READ_I: + uio.uio_rw = UIO_READ; + break; + case PIOD_WRITE_D: + case PIOD_WRITE_I: + uio.uio_rw = UIO_WRITE; + break; + default: + return (EINVAL); + } + error = proc_rwmem(p, &uio); + r.piod.piod_len -= uio.uio_resid; + (void)copyout(&r.piod, uap->addr, sizeof r.piod); + return (error); + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + + case PT_SETREGS: + _PHOLD(p); + error = proc_write_regs(td2, &r.reg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETREGS: + _PHOLD(p); + error = proc_read_regs(td2, &r.reg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.reg, uap->addr, sizeof r.reg); + return (error); + + case PT_SETFPREGS: + _PHOLD(p); + error = proc_write_fpregs(td2, &r.fpreg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETFPREGS: + _PHOLD(p); + error = proc_read_fpregs(td2, &r.fpreg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.fpreg, uap->addr, sizeof r.fpreg); + return (error); + + case PT_SETDBREGS: + _PHOLD(p); + error = proc_write_dbregs(td2, &r.dbreg); + _PRELE(p); + PROC_UNLOCK(p); + return (error); + + case PT_GETDBREGS: + _PHOLD(p); + error = proc_read_dbregs(td2, &r.dbreg); + _PRELE(p); + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&r.dbreg, uap->addr, sizeof r.dbreg); + return (error); + + default: + KASSERT(0, ("unreachable code\n")); + break; + } + + KASSERT(0, ("unreachable code\n")); + return (0); + +fail: + PROC_UNLOCK(p); + if (proctree_locked) + sx_xunlock(&proctree_lock); + return (error); +} + +/* + * Stop a process because of a debugging event; + * stay stopped until p->p_step is cleared + * (cleared by PIOCCONT in procfs). + */ +void +stopevent(struct proc *p, unsigned int event, unsigned int val) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED); + p->p_step = 1; + + do { + p->p_xstat = val; + p->p_stype = event; /* Which event caused the stop? */ + wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ + msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); + } while (p->p_step); +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 0000000..c8a6198 --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/filedesc.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/route.h> + +struct fileops socketops = { + soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, + soo_stat, soo_close +}; + +/* ARGSUSED */ +int +soo_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct socket *so = (struct socket *)fp->f_data; + int error; + + mtx_lock(&Giant); + error = so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); + mtx_unlock(&Giant); + return (error); +} + +/* ARGSUSED */ +int +soo_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct socket *so = (struct socket *)fp->f_data; + int error; + + mtx_lock(&Giant); + error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, + uio->uio_td); + mtx_unlock(&Giant); + return (error); +} + +int +soo_ioctl(fp, cmd, data, td) + struct file *fp; + u_long cmd; + register caddr_t data; + struct thread *td; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; + } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &so->so_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(so->so_sigio); + return (0); + + case SIOCSPGRP: + return (fsetown(-(*(int *)data), &so->so_sigio)); + + case SIOCGPGRP: + *(int *)data = -fgetown(so->so_sigio); + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific 
ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, td)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, td)); +} + +int +soo_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, td); +} + +int +soo_stat(fp, ub, td) + struct file *fp; + struct stat *ub; + struct thread *td; +{ + struct socket *so = (struct socket *)fp->f_data; + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + /* + * If SS_CANTRCVMORE is set, but there's still data left in the + * receive buffer, the socket is still readable. + */ + if ((so->so_state & SS_CANTRCVMORE) == 0 || + so->so_rcv.sb_cc != 0) + ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; + if ((so->so_state & SS_CANTSENDMORE) == 0) + ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; + ub->st_size = so->so_rcv.sb_cc; + ub->st_uid = so->so_cred->cr_uid; + ub->st_gid = so->so_cred->cr_gid; + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); +} + +/* + * API socket close on file pointer. We call soclose() to close the + * socket (including initiating closing protocols). soclose() will + * sorele() the file reference but the actual socket will not go away + * until the socket's ref count hits 0. + */ +/* ARGSUSED */ +int +soo_close(fp, td) + struct file *fp; + struct thread *td; +{ + int error = 0; + struct socket *so; + + so = (struct socket *)fp->f_data; + fp->f_ops = &badfileops; + fp->f_data = 0; + + if (so) + error = soclose(so); + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 0000000..8b092fc --- /dev/null +++ b/sys/kern/syscalls.c @@ -0,0 +1,403 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. 
+ * $FreeBSD$ + * created from FreeBSD: src/sys/kern/syscalls.master,v 1.113 2002/06/13 23:43:53 rwatson Exp + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "old.creat", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "obs_execv", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "old.lseek", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "old.stat", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "old.lstat", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ + "ktrace", /* 45 = ktrace */ + "old.sigaction", /* 46 = old sigaction */ + "getgid", /* 47 = getgid */ + "old.sigprocmask", /* 48 = old sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "old.sigpending", /* 52 = old sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "old.mmap", /* 71 = old mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ + "setitimer", /* 83 = setitimer */ + "old.wait", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "old.accept", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ + "osigreturn", /* 103 = 
osigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ + "old.sigsuspend", /* 111 = old sigsuspend */ + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ + "#119", /* 119 = resuba */ + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ + "rename", /* 128 = rename */ + "old.truncate", /* 129 = old truncate */ + "old.ftruncate", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ + "nfssvc", /* 155 = nfssvc */ + "old.getdirentries", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ + "getfh", /* 161 = getfh */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "pread", /* 173 = pread */ + "pwrite", /* 174 = pwrite */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + "#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + "seteuid", /* 183 = seteuid */ + "#184", /* 184 = lfs_bmapv */ + "#185", /* 185 = lfs_markv */ + "#186", /* 186 = lfs_segclean */ + "#187", /* 187 = lfs_segwait */ + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193", /* 193 = nosys */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 
204 = munlock */ + "undelete", /* 205 = undelete */ + "futimes", /* 206 = futimes */ + "getpgid", /* 207 = getpgid */ + "#208", /* 208 = newreboot */ + "poll", /* 209 = poll */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "#223", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ + "shmat", /* 228 = shmat */ + "shmctl", /* 229 = shmctl */ + "shmdt", /* 230 = shmdt */ + "shmget", /* 231 = shmget */ + "clock_gettime", /* 232 = clock_gettime */ + "clock_settime", /* 233 = clock_settime */ + "clock_getres", /* 234 = clock_getres */ + "#235", /* 235 = timer_create */ + "#236", /* 236 = timer_delete */ + "#237", /* 237 = timer_settime */ + "#238", /* 238 = timer_gettime */ + "#239", /* 239 = timer_getoverrun */ + "nanosleep", /* 240 = nanosleep */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys */ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ + "openbsd_poll", /* 252 = openbsd_poll */ + "issetugid", /* 253 = issetugid */ + "lchown", /* 254 = lchown */ + "#255", /* 255 = nosys */ + "#256", /* 256 = nosys */ + "#257", /* 257 = nosys */ + "#258", /* 258 = nosys */ + "#259", /* 259 = nosys */ + "#260", /* 260 = nosys */ + "#261", /* 261 = nosys */ + "#262", /* 262 = nosys */ + "#263", /* 263 = nosys */ + "#264", /* 264 = nosys */ + "#265", /* 265 = nosys */ + "#266", /* 266 = nosys */ + "#267", /* 267 = nosys */ + "#268", /* 268 = nosys */ + "#269", /* 269 = nosys */ + "#270", /* 270 = nosys */ + "#271", /* 271 = nosys */ + "getdents", /* 272 = getdents */ + "#273", /* 273 = nosys */ + "lchmod", /* 274 = lchmod */ + "netbsd_lchown", /* 275 = netbsd_lchown */ + "lutimes", /* 276 = lutimes */ + "netbsd_msync", /* 277 = netbsd_msync */ + "nstat", /* 278 = nstat */ + "nfstat", /* 279 = nfstat */ + "nlstat", /* 280 = nlstat */ + "#281", /* 281 = nosys */ + "#282", /* 282 = nosys */ + "#283", /* 283 = nosys */ + "#284", /* 284 = nosys */ + "#285", /* 285 = nosys */ + "#286", /* 286 = nosys */ + "#287", /* 287 = nosys */ + "#288", /* 288 = nosys */ + "#289", /* 289 = nosys */ + "#290", /* 290 = nosys */ + "#291", /* 291 = nosys */ + "#292", /* 292 = nosys */ + "#293", /* 293 = nosys */ + "#294", /* 294 = nosys */ + "#295", /* 295 = nosys */ + "#296", /* 296 = nosys */ + "fhstatfs", /* 297 = fhstatfs */ + "fhopen", /* 298 = fhopen */ + "fhstat", /* 299 = fhstat */ + "modnext", /* 300 = modnext */ + "modstat", /* 301 = modstat */ + "modfnext", /* 302 = modfnext */ + "modfind", /* 303 = modfind */ + "kldload", /* 304 = kldload */ + "kldunload", /* 305 = kldunload */ + "kldfind", /* 306 = kldfind */ + "kldnext", /* 307 = kldnext */ + "kldstat", /* 308 = kldstat */ + "kldfirstmod", /* 309 = kldfirstmod */ + "getsid", /* 310 = getsid */ + "setresuid", /* 311 = setresuid */ + "setresgid", /* 312 = setresgid */ + "obs_signanosleep", /* 313 = obsolete signanosleep */ + "aio_return", /* 314 = aio_return */ + 
"aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ + "yield", /* 321 = yield */ + "obs_thr_sleep", /* 322 = obsolete thr_sleep */ + "obs_thr_wakeup", /* 323 = obsolete thr_wakeup */ + "mlockall", /* 324 = mlockall */ + "munlockall", /* 325 = munlockall */ + "__getcwd", /* 326 = __getcwd */ + "sched_setparam", /* 327 = sched_setparam */ + "sched_getparam", /* 328 = sched_getparam */ + "sched_setscheduler", /* 329 = sched_setscheduler */ + "sched_getscheduler", /* 330 = sched_getscheduler */ + "sched_yield", /* 331 = sched_yield */ + "sched_get_priority_max", /* 332 = sched_get_priority_max */ + "sched_get_priority_min", /* 333 = sched_get_priority_min */ + "sched_rr_get_interval", /* 334 = sched_rr_get_interval */ + "utrace", /* 335 = utrace */ + "sendfile", /* 336 = sendfile */ + "kldsym", /* 337 = kldsym */ + "jail", /* 338 = jail */ + "#339", /* 339 = pioctl */ + "sigprocmask", /* 340 = sigprocmask */ + "sigsuspend", /* 341 = sigsuspend */ + "sigaction", /* 342 = sigaction */ + "sigpending", /* 343 = sigpending */ + "sigreturn", /* 344 = sigreturn */ + "#345", /* 345 = sigtimedwait */ + "#346", /* 346 = sigwaitinfo */ + "__acl_get_file", /* 347 = __acl_get_file */ + "__acl_set_file", /* 348 = __acl_set_file */ + "__acl_get_fd", /* 349 = __acl_get_fd */ + "__acl_set_fd", /* 350 = __acl_set_fd */ + "__acl_delete_file", /* 351 = __acl_delete_file */ + "__acl_delete_fd", /* 352 = __acl_delete_fd */ + "__acl_aclcheck_file", /* 353 = __acl_aclcheck_file */ + "__acl_aclcheck_fd", /* 354 = __acl_aclcheck_fd */ + "extattrctl", /* 355 = extattrctl */ + "extattr_set_file", /* 356 = extattr_set_file */ + "extattr_get_file", /* 357 = extattr_get_file */ + "extattr_delete_file", /* 358 = extattr_delete_file */ + "aio_waitcomplete", /* 359 = aio_waitcomplete */ + "getresuid", /* 360 = getresuid */ + "getresgid", /* 361 = getresgid */ + "kqueue", /* 362 = kqueue */ + "kevent", /* 363 = kevent */ + "#364", /* 364 = __cap_get_proc */ + "#365", /* 365 = __cap_set_proc */ + "#366", /* 366 = __cap_get_fd */ + "#367", /* 367 = __cap_get_file */ + "#368", /* 368 = __cap_set_fd */ + "#369", /* 369 = __cap_set_file */ + "lkmressys", /* 370 = lkmressys */ + "extattr_set_fd", /* 371 = extattr_set_fd */ + "extattr_get_fd", /* 372 = extattr_get_fd */ + "extattr_delete_fd", /* 373 = extattr_delete_fd */ + "__setugid", /* 374 = __setugid */ + "nfsclnt", /* 375 = nfsclnt */ + "eaccess", /* 376 = eaccess */ + "#377", /* 377 = afs_syscall */ + "nmount", /* 378 = nmount */ + "kse_exit", /* 379 = kse_exit */ + "kse_wakeup", /* 380 = kse_wakeup */ + "kse_new", /* 381 = kse_new */ + "thread_wakeup", /* 382 = thread_wakeup */ + "kse_yield", /* 383 = kse_yield */ + "#384", /* 384 = __mac_get_proc */ + "#385", /* 385 = __mac_set_proc */ + "#386", /* 386 = __mac_get_fd */ + "#387", /* 387 = __mac_get_file */ + "#388", /* 388 = __mac_set_fd */ + "#389", /* 389 = __mac_set_file */ + "kenv", /* 390 = kenv */ + "lchflags", /* 391 = lchflags */ + "uuidgen", /* 392 = uuidgen */ +}; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 0000000..d8115fb --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,565 @@ + $FreeBSD$ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 +; +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. 
+ +; Columns: number [M]type nargs namespc name alt{name,tag,rtyp}/comments +; number system call number, must be in order +; type one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA, +; [M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL, +; [M]NOSTD +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + +; types: +; [M] e.g. like MSTD -- means the system call is MP-safe. If no +; M prefix is used, the syscall wrapper will obtain the Giant +; lock for the syscall. +; STD always included +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only +; NOSTD implemented but as a lkm that can be statically +; compiled in sysent entry will be filled with lkmsys +; so the SYSCALL_MODULE macro works + +; #ifdef's, etc. may be included, and are copied to the output files. + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. + +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 MSTD NOHIDE { void sys_exit(int rval); } exit sys_exit_args void +2 MSTD POSIX { int fork(void); } +3 MSTD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); } +4 MSTD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. +6 MSTD POSIX { int close(int fd); } +7 MSTD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 MSTD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 MSTD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ + caddr_t data); } +; XXX `path' should have type `const char *' but we're not ready for that. 
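+; (Illustrative sketch, not from the original master file: makesyscalls.sh
+; expands each row into the generated sources, so an entry such as
+;	20	MSTD	POSIX	{ pid_t getpid(void); }
+; is expected to yield "getpid" in syscallnames[] (syscalls.c), a
+; "#define	SYS_getpid	20" line in syscall.h, a getpid_args struct and
+; prototype in sysproto.h, and a sysent slot whose wrapper skips the Giant
+; lock because of the M prefix.)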
+22 STD BSD { int unmount(char *path, int flags); } +23 MSTD POSIX { int setuid(uid_t uid); } +24 MSTD POSIX { uid_t getuid(void); } +25 MSTD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ + int data); } +27 MSTD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 MSTD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 MSTD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +30 MSTD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 MSTD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 MSTD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 MSTD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 MSTD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int pipe(void); } +43 MSTD POSIX { gid_t getegid(void); } +44 MSTD BSD { int profil(caddr_t samples, size_t size, \ + size_t offset, u_int scale); } +45 STD BSD { int ktrace(const char *fname, int ops, int facs, \ + int pid); } +46 MCOMPAT POSIX { int sigaction(int signum, struct osigaction *nsa, \ + struct osigaction *osa); } +47 MSTD POSIX { gid_t getgid(void); } +48 MCOMPAT POSIX { int sigprocmask(int how, osigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. +49 MSTD BSD { int getlogin(char *namebuf, u_int namelen); } +50 MSTD BSD { int setlogin(char *namebuf); } +51 MSTD BSD { int acct(char *path); } +52 MCOMPAT POSIX { int sigpending(void); } +53 MSTD BSD { int sigaltstack(stack_t *ss, stack_t *oss); } +54 MSTD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 MSTD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 MSTD POSIX { int execve(char *fname, char **argv, char **envv); } +60 MSTD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 MCOMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 MCOMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \ + int arg); } getkerninfo getkerninfo_args int +64 MCOMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(void *addr, size_t len, int flags); } +66 MSTD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 MSTD BSD { int sbrk(int incr); } +70 MSTD BSD { int sstk(int incr); } +71 MCOMPAT BSD { int mmap(void *addr, int len, int prot, \ + int flags, int fd, long pos); } +72 MSTD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 MSTD BSD { int munmap(void *addr, size_t len); } +74 MSTD BSD { int mprotect(const void *addr, size_t len, int prot); } +75 MSTD BSD { int madvise(void *addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 MSTD BSD { int mincore(const void *addr, size_t len, \ + char *vec); } +79 MSTD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 MSTD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 MSTD POSIX { int getpgrp(void); } +82 
MSTD POSIX { int setpgid(int pid, int pgid); } +83 MSTD BSD { int setitimer(u_int which, struct itimerval *itv, \ + struct itimerval *oitv); } +84 MCOMPAT BSD { int wait(void); } +85 MSTD BSD { int swapon(char *name); } +86 MSTD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 MCOMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 MCOMPAT BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 MSTD BSD { int getdtablesize(void); } +90 MSTD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 MSTD POSIX { int fcntl(int fd, int cmd, long arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. +93 MSTD BSD { int select(int nd, fd_set *in, fd_set *ou, \ + fd_set *ex, struct timeval *tv); } +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 MSTD BSD { int setpriority(int which, int who, int prio); } +97 MSTD BSD { int socket(int domain, int type, int protocol); } +98 MSTD BSD { int connect(int s, caddr_t name, int namelen); } +99 MCPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 MSTD BSD { int getpriority(int which, int who); } +101 MCOMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 MCOMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 MSTD BSD { int osigreturn(struct osigcontext *sigcntxp); } +104 MSTD BSD { int bind(int s, caddr_t name, int namelen); } +105 MSTD BSD { int setsockopt(int s, int level, int name, \ + caddr_t val, int valsize); } +106 MSTD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 MCOMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ + struct sigvec *osv); } +109 MCOMPAT BSD { int sigblock(int mask); } +110 MCOMPAT BSD { int sigsetmask(int mask); } +111 MCOMPAT POSIX { int sigsuspend(osigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. 
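+; (Hedged illustration, not in the original file: the old stub is in effect
+; "int osigsuspend(osigset_t mask)" -- the 32-bit mask itself is the
+; argument -- while the sigset_t flavour at entry 341 below is
+; "int sigsuspend(const sigset_t *sigmask)", so the compat handler has to
+; widen the int mask into a sigset_t before blocking.)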
+112 MCOMPAT BSD { int sigstack(struct sigstack *nss, \ + struct sigstack *oss); } +113 MCOMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 MCOMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 MSTD BSD { int gettimeofday(struct timeval *tp, \ + struct timezone *tzp); } +117 MSTD BSD { int getrusage(int who, struct rusage *rusage); } +118 MSTD BSD { int getsockopt(int s, int level, int name, \ + caddr_t val, int *avalsize); } +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 MSTD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 MSTD BSD { int writev(int fd, struct iovec *iovp, \ + u_int iovcnt); } +122 MSTD BSD { int settimeofday(struct timeval *tv, \ + struct timezone *tzp); } +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 MCPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 MSTD BSD { int setreuid(int ruid, int euid); } +127 MSTD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 MSTD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 MSTD BSD { int sendto(int s, caddr_t buf, size_t len, \ + int flags, caddr_t to, int tolen); } +134 MSTD BSD { int shutdown(int s, int how); } +135 MSTD BSD { int socketpair(int domain, int type, int protocol, \ + int *rsv); } +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 MSTD BSD { int adjtime(struct timeval *delta, \ + struct timeval *olddelta); } +141 MCOMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 MCOMPAT BSD { long gethostid(void); } +143 MCOMPAT BSD { int sethostid(long hostid); } +144 MCOMPAT BSD { int getrlimit(u_int which, struct orlimit *rlp); } +145 MCOMPAT BSD { int setrlimit(u_int which, struct orlimit *rlp); } +146 MCOMPAT BSD { int killpg(int pgid, int signum); } +147 MSTD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char *path, int cmd, int uid, \ + caddr_t arg); } +149 MCOMPAT BSD { int quota(void); } +150 MCPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system calls. (This includes various calls added for compatibity +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys +; 155 is initialized by the NFS code, if present. 
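+; (Illustrative note, assuming the stock module plumbing: NOSTD and NOIMPL
+; slots start out as placeholders and are patched when the owning module
+; initializes -- e.g. the MNOSTD msgsys entry at 170 is claimed via
+; SYSCALL_MODULE_HELPER(msgsys) in sys/kern/sysv_msg.c later in this change,
+; and nfssvc below is filled in the same way by the NFS code.)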
+155 MNOIMPL BSD { int nfssvc(int flag, caddr_t argp); } +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +161 STD BSD { int getfh(char *fname, struct fhandle *fhp); } +162 MSTD BSD { int getdomainname(char *domainname, int len); } +163 MSTD BSD { int setdomainname(char *domainname, int len); } +164 MSTD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 MSTD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +; 169 is initialized by the SYSVSEM code if present or loaded +169 MNOSTD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; 169 is initialized by the SYSVMSG code if present or loaded +; XXX should be { int semsys(int which, ...); } +170 MNOSTD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; 169 is initialized by the SYSVSHM code if present or loaded +; XXX should be { int msgsys(int which, ...); } +171 MNOSTD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 MSTD POSIX { ssize_t pread(int fd, void *buf, size_t nbyte, \ + int pad, off_t offset); } +174 MSTD POSIX { ssize_t pwrite(int fd, const void *buf, \ + size_t nbyte, int pad, off_t offset); } +175 UNIMPL NOHIDE nosys +176 MSTD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys + +; Syscalls 181-199 are used by/reserved for BSD +181 MSTD POSIX { int setgid(gid_t gid); } +182 MSTD BSD { int setegid(gid_t egid); } +183 MSTD BSD { int seteuid(uid_t euid); } +184 UNIMPL BSD lfs_bmapv +185 UNIMPL BSD lfs_markv +186 UNIMPL BSD lfs_segclean +187 UNIMPL BSD lfs_segwait +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 MSTD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 MSTD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 MSTD BSD { int getrlimit(u_int which, \ + struct rlimit *rlp); } \ + getrlimit __getrlimit_args int +195 MSTD BSD { int setrlimit(u_int which, \ + struct rlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +197 MSTD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ + int flags, int fd, int pad, off_t pos); } +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ + int whence); } +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 MSTD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; here allows to avoid one in libc/sys/Makefile.inc. 
+203 MSTD BSD { int mlock(const void *addr, size_t len); } +204 MSTD BSD { int munlock(const void *addr, size_t len); } +205 STD BSD { int undelete(char *path); } +206 STD BSD { int futimes(int fd, struct timeval *tptr); } +207 MSTD BSD { int getpgid(pid_t pid); } +208 UNIMPL NOHIDE newreboot (NetBSD) +209 MSTD BSD { int poll(struct pollfd *fds, u_int nfds, \ + int timeout); } + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; They are initialized by thier respective modules/sysinits +220 MNOSTD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 MNOSTD BSD { int semget(key_t key, int nsems, int semflg); } +222 MNOSTD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 UNIMPL NOHIDE semconfig +224 MNOSTD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 MNOSTD BSD { int msgget(key_t key, int msgflg); } +226 MNOSTD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 MNOSTD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 MNOSTD BSD { int shmat(int shmid, void *shmaddr, int shmflg); } +229 MNOSTD BSD { int shmctl(int shmid, int cmd, \ + struct shmid_ds *buf); } +230 MNOSTD BSD { int shmdt(void *shmaddr); } +231 MNOSTD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 MSTD POSIX { int clock_gettime(clockid_t clock_id, \ + struct timespec *tp); } +233 MSTD POSIX { int clock_settime(clockid_t clock_id, \ + const struct timespec *tp); } +234 MSTD POSIX { int clock_getres(clockid_t clock_id, \ + struct timespec *tp); } +235 UNIMPL NOHIDE timer_create +236 UNIMPL NOHIDE timer_delete +237 UNIMPL NOHIDE timer_settime +238 UNIMPL NOHIDE timer_gettime +239 UNIMPL NOHIDE timer_getoverrun +240 MSTD POSIX { int nanosleep(const struct timespec *rqtp, \ + struct timespec *rmtp); } +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 MSTD BSD { int minherit(void *addr, size_t len, int inherit); } +251 MSTD BSD { int rfork(int flags); } +252 MSTD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \ + int timeout); } +253 STD BSD { int issetugid(void); } +254 STD BSD { int lchown(char *path, int uid, int gid); } +255 UNIMPL NOHIDE nosys +256 UNIMPL NOHIDE nosys +257 UNIMPL NOHIDE nosys +258 UNIMPL NOHIDE nosys +259 UNIMPL NOHIDE nosys +260 UNIMPL NOHIDE nosys +261 UNIMPL NOHIDE nosys +262 UNIMPL NOHIDE nosys +263 UNIMPL NOHIDE nosys +264 UNIMPL NOHIDE nosys +265 UNIMPL NOHIDE nosys +266 UNIMPL NOHIDE nosys +267 UNIMPL NOHIDE nosys +268 UNIMPL NOHIDE nosys +269 UNIMPL NOHIDE nosys +270 UNIMPL NOHIDE nosys +271 UNIMPL NOHIDE nosys +272 STD BSD { int getdents(int fd, char *buf, size_t count); } +273 UNIMPL NOHIDE nosys +274 STD BSD { int lchmod(char *path, mode_t mode); } 
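+; (Illustrative note on the alt columns described in the header: when
+; altname/alttag/altrtyp are given, the row reuses an existing handler under
+; a different syscall name -- the NOPROTO entry at 275 below dispatches to
+; the existing lchown() with lchown_args, but appears as "netbsd_lchown" in
+; syscallnames[] and syscall.h.)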
+275 NOPROTO BSD { int lchown(char *path, uid_t uid, gid_t gid); } netbsd_lchown lchown_args int +276 STD BSD { int lutimes(char *path, struct timeval *tptr); } +277 MNOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync msync_args int +278 STD BSD { int nstat(char *path, struct nstat *ub); } +279 MSTD BSD { int nfstat(int fd, struct nstat *sb); } +280 STD BSD { int nlstat(char *path, struct nstat *ub); } +281 UNIMPL NOHIDE nosys +282 UNIMPL NOHIDE nosys +283 UNIMPL NOHIDE nosys +284 UNIMPL NOHIDE nosys +285 UNIMPL NOHIDE nosys +286 UNIMPL NOHIDE nosys +287 UNIMPL NOHIDE nosys +288 UNIMPL NOHIDE nosys +289 UNIMPL NOHIDE nosys +290 UNIMPL NOHIDE nosys +291 UNIMPL NOHIDE nosys +292 UNIMPL NOHIDE nosys +293 UNIMPL NOHIDE nosys +294 UNIMPL NOHIDE nosys +295 UNIMPL NOHIDE nosys +296 UNIMPL NOHIDE nosys +; XXX 297 is 300 in NetBSD +297 STD BSD { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } +298 STD BSD { int fhopen(const struct fhandle *u_fhp, int flags); } +299 STD BSD { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } +; syscall numbers for FreeBSD +300 MSTD BSD { int modnext(int modid); } +301 MSTD BSD { int modstat(int modid, struct module_stat* stat); } +302 MSTD BSD { int modfnext(int modid); } +303 MSTD BSD { int modfind(const char *name); } +304 MSTD BSD { int kldload(const char *file); } +305 MSTD BSD { int kldunload(int fileid); } +306 MSTD BSD { int kldfind(const char *file); } +307 MSTD BSD { int kldnext(int fileid); } +308 MSTD BSD { int kldstat(int fileid, struct kld_file_stat* stat); } +309 MSTD BSD { int kldfirstmod(int fileid); } +310 MSTD BSD { int getsid(pid_t pid); } +311 MSTD BSD { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } +312 MSTD BSD { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } +313 OBSOL NOHIDE signanosleep +314 NOSTD BSD { int aio_return(struct aiocb *aiocbp); } +315 NOSTD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } +316 NOSTD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); } +317 NOSTD BSD { int aio_error(struct aiocb *aiocbp); } +318 NOSTD BSD { int aio_read(struct aiocb *aiocbp); } +319 NOSTD BSD { int aio_write(struct aiocb *aiocbp); } +320 NOSTD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } +321 MSTD BSD { int yield(void); } +322 OBSOL NOHIDE thr_sleep +323 OBSOL NOHIDE thr_wakeup +324 MSTD BSD { int mlockall(int how); } +325 MSTD BSD { int munlockall(void); } +326 STD BSD { int __getcwd(u_char *buf, u_int buflen); } + +327 MSTD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); } +328 MSTD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); } + +329 MSTD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } +330 MSTD POSIX { int sched_getscheduler (pid_t pid); } + +331 MSTD POSIX { int sched_yield (void); } +332 MSTD POSIX { int sched_get_priority_max (int policy); } +333 MSTD POSIX { int sched_get_priority_min (int policy); } +334 MSTD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); } +335 STD BSD { int utrace(const void *addr, size_t len); } +336 MSTD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ + struct sf_hdtr *hdtr, off_t *sbytes, int flags); } +337 STD BSD { int kldsym(int fileid, int cmd, void *data); } +338 MSTD BSD { int jail(struct jail *jail); } +339 UNIMPL BSD pioctl +340 MSTD POSIX { int sigprocmask(int how, const sigset_t *set, \ + sigset_t *oset); } 
+341 MSTD POSIX { int sigsuspend(const sigset_t *sigmask); } +342 MSTD POSIX { int sigaction(int sig, const struct sigaction *act, \ + struct sigaction *oact); } +343 MSTD POSIX { int sigpending(sigset_t *set); } +344 MSTD BSD { int sigreturn(const struct __ucontext *sigcntxp); } +345 UNIMPL NOHIDE sigtimedwait +346 UNIMPL NOHIDE sigwaitinfo +347 MSTD BSD { int __acl_get_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +348 MSTD BSD { int __acl_set_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +349 MSTD BSD { int __acl_get_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +350 MSTD BSD { int __acl_set_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +351 MSTD BSD { int __acl_delete_file(const char *path, \ + acl_type_t type); } +352 MSTD BSD { int __acl_delete_fd(int filedes, acl_type_t type); } +353 MSTD BSD { int __acl_aclcheck_file(const char *path, \ + acl_type_t type, struct acl *aclp); } +354 MSTD BSD { int __acl_aclcheck_fd(int filedes, acl_type_t type, \ + struct acl *aclp); } +355 STD BSD { int extattrctl(const char *path, int cmd, \ + const char *filename, int attrnamespace, \ + const char *attrname); } +356 STD BSD { int extattr_set_file(const char *path, \ + int attrnamespace, const char *attrname, \ + void *data, size_t nbytes); } +357 STD BSD { ssize_t extattr_get_file(const char *path, \ + int attrnamespace, const char *attrname, \ + void *data, size_t nbytes); } +358 STD BSD { int extattr_delete_file(const char *path, \ + int attrnamespace, const char *attrname); } +359 NOSTD BSD { int aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } +360 MSTD BSD { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } +361 MSTD BSD { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } +362 MSTD BSD { int kqueue(void); } +363 MSTD BSD { int kevent(int fd, \ + const struct kevent *changelist, int nchanges, \ + struct kevent *eventlist, int nevents, \ + const struct timespec *timeout); } +364 UNIMPL BSD __cap_get_proc +365 UNIMPL BSD __cap_set_proc +366 UNIMPL BSD __cap_get_fd +367 UNIMPL BSD __cap_get_file +368 UNIMPL BSD __cap_set_fd +369 UNIMPL BSD __cap_set_file +370 NODEF NOHIDE lkmressys lkmressys nosys_args int +371 STD BSD { int extattr_set_fd(int fd, int attrnamespace, \ + const char *attrname, void *data, \ + size_t nbytes); } +372 STD BSD { ssize_t extattr_get_fd(int fd, int attrnamespace, \ + const char *attrname, void *data, size_t nbytes); } +373 STD BSD { int extattr_delete_fd(int fd, int attrnamespace, \ + const char *attrname); } +374 MSTD BSD { int __setugid(int flag); } +375 NOIMPL BSD { int nfsclnt(int flag, caddr_t argp); } +376 STD BSD { int eaccess(char *path, int flags); } +377 UNIMPL BSD afs_syscall +378 STD BSD { int nmount(struct iovec *iovp, unsigned int iovcnt, \ + int flags); } +379 STD BSD { int kse_exit(void); } +380 STD BSD { int kse_wakeup(void); } +381 STD BSD { int kse_new(struct kse_mailbox * mbx, \ + int new_grp_flag); } +382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); } +383 STD BSD { int kse_yield(void); } +384 UNIMPL BSD __mac_get_proc +385 UNIMPL BSD __mac_set_proc +386 UNIMPL BSD __mac_get_fd +387 UNIMPL BSD __mac_get_file +388 UNIMPL BSD __mac_set_fd +389 UNIMPL BSD __mac_set_file +390 STD BSD { int kenv(int what, const char *name, char *value, \ + int len); } +391 STD BSD { int lchflags(const char *path, int flags); } +392 STD BSD { int uuidgen(struct uuid *store, int count); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 
0000000..fc5fd8f --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,97 @@ +/* $FreeBSD$ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/ipc.h> +#include <sys/proc.h> +#include <sys/ucred.h> + +void (*shmfork_hook)(struct proc *, struct proc *) = NULL; +void (*shmexit_hook)(struct proc *) = NULL; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + + if (shmfork_hook != NULL) + shmfork_hook(p1, p2); + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + + if (shmexit_hook != NULL) + shmexit_hook(p); + return; +} + +/* + * Check for ipc permission + */ + +int +ipcperm(td, perm, mode) + struct thread *td; + struct ipc_perm *perm; + int mode; +{ + struct ucred *cred = td->td_ucred; + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (suser(td) == 0 ? 0 : EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode || + suser(td) == 0 ? 0 : EACCES); +} diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..4dd2249 --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1240 @@ +/* $FreeBSD$ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. 
+ * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/msg.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/jail.h> + +static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); + +static void msginit(void); +static int msgunload(void); +static int sysvmsg_modload(struct module *, int, void *); + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +static void msg_freehdr(struct msg *msghdr); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +struct msg { + struct msg *msg_next; /* next msg in the chain */ + long msg_type; /* type of this message */ + /* >0 -> type of this message */ + /* 0 -> free header */ + u_short msg_ts; /* size of this message */ + short msg_spot; /* location of start of msg in buffer */ +}; + + +#ifndef MSGSSZ +#define MSGSSZ 8 /* Each segment must be 2^N long */ +#endif +#ifndef MSGSEG +#define MSGSEG 2048 /* must be less than 32767 */ +#endif +#define MSGMAX (MSGSSZ*MSGSEG) +#ifndef MSGMNB +#define MSGMNB 2048 /* max # of bytes in a queue */ +#endif +#ifndef MSGMNI +#define MSGMNI 40 +#endif +#ifndef MSGTQL +#define MSGTQL 40 +#endif + +/* + * Based on the configuration parameters described in an SVR2 (yes, two) + * config(1m) man page. + * + * Each message is broken up and stored in segments that are msgssz bytes + * long. For efficiency reasons, this should be a power of two. Also, + * it doesn't make sense if it is less than 8 or greater than about 256. + * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of + * two between 8 and 1024 inclusive (and panic's if it isn't). + */ +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; + +/* + * macros to convert between msqid_ds's and msqid's. + * (specific to this implementation) + */ +#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) +#define MSQID_IX(id) ((id) & 0xffff) +#define MSQID_SEQ(id) (((id) >> 16) & 0xffff) + +/* + * The rest of this file is specific to this particular implementation. + */ + +struct msgmap { + short next; /* next segment in buffer */ + /* -1 -> available */ + /* 0..(MSGSEG-1) -> index of next segment */ +}; + +#define MSG_LOCKED 01000 /* Is this msqid_ds locked? 
*/ + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs;/* list of free msg headers */ +static char *msgpool; /* MSGMAX byte long msg buffer pool */ +static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +static struct msg *msghdrs; /* MSGTQL msg headers */ +static struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +static void +msginit() +{ + register int i; + + TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); + TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); + msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; + TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni); + + msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); + if (msgpool == NULL) + panic("msgpool is NULL"); + msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); + if (msgmaps == NULL) + panic("msgmaps is NULL"); + msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); + if (msghdrs == NULL) + panic("msghdrs is NULL"); + msqids = malloc(sizeof(struct msqid_ds) * msginfo.msgmni, M_MSG, M_WAITOK); + if (msqids == NULL) + panic("msqids is NULL"); + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... + */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + msqids[i].msg_perm.mode = 0; + } +} + +static int +msgunload() +{ + struct msqid_ds *msqptr; + int msqid; + + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 || + (msqptr->msg_perm.mode & MSG_LOCKED) != 0) + break; + } + if (msqid != msginfo.msgmni) + return (EBUSY); + + free(msgpool, M_MSG); + free(msgmaps, M_MSG); + free(msghdrs, M_MSG); + free(msqids, M_MSG); + return (0); +} + + +static int +sysvmsg_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + msginit(); + break; + case MOD_UNLOAD: + error = msgunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvmsg_mod = { + "sysvmsg", + &sysvmsg_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(msgsys); +SYSCALL_MODULE_HELPER(msgctl); +SYSCALL_MODULE_HELPER(msgget); +SYSCALL_MODULE_HELPER(msgsnd); +SYSCALL_MODULE_HELPER(msgrcv); + +DECLARE_MODULE(sysvmsg, sysvmsg_mod, + SI_SUB_SYSV_MSG, SI_ORDER_FIRST); +MODULE_VERSION(sysvmsg, 1); + +/* + * Entry point for all MSG calls + * + * MPSAFE + */ +int +msgsys(td, uap) + struct thread *td; + /* XXX actually varargs. */ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*msgcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +/* + * MPSAFE + */ +int +msgctl(td, uap) + struct thread *td; + register struct msgctl_args *uap; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + int rval, error; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + error = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M))) + goto done2; + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + 
msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_M))) + goto done2; + if ((error = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + goto done2; + if (msqbuf.msg_qbytes > msqptr->msg_qbytes) { + error = suser(td); + if (error) + goto done2; + } + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + error = EINVAL; /* non-standard errno! */ + goto done2; + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time_second; + break; + + case IPC_STAT: + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + goto done2; + } + error = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + error = EINVAL; + goto done2; + } + + if (error == 0) + td->td_retval[0] = rval; +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgget(td, uap) + struct thread *td; + register struct msgget_args *uap; +{ + int msqid, error = 0; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = td->td_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + error = EEXIST; + goto done2; + } + if ((error = ipcperm(td, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + goto done2; + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + error = ENOSPC; + goto done2; + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time_second; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + error = ENOENT; + goto done2; + } + +found: + /* Construct the unique msqid */ + td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgsnd(td, uap) + struct thread *td; + register struct msgsnd_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, error = 0; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + goto done2; + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + error = EINVAL; + goto done2; + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no 
more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller doesn't want to wait\n"); +#endif + error = EAGAIN; + goto done2; + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request */ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, error=%d\n", error); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + error = EINTR; + goto done2; + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + error = EIDRM; + goto done2; + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); +#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((error = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", error); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + error = EINVAL; + goto done2; + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if 
((error = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying in message segment\n", error); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + goto done2; + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + error = EIDRM; + goto done2; + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = td->td_proc->p_pid; + msqptr->msg_stime = time_second; + + wakeup((caddr_t)msqptr); + td->td_retval[0] = 0; +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +/* + * MPSAFE + */ +int +msgrcv(td, uap) + struct thread *td; + register struct msgrcv_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int error = 0; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + error = EINVAL; + goto done2; + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + error = EINVAL; + goto done2; + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + error = EINVAL; + goto done2; + } + + if ((error = ipcperm(td, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + goto done2; + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + error = E2BIG; + goto done2; + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? 
+ * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! + */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + error = E2BIG; + goto done2; + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? + */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ + error = ENOMSG; + goto done2; + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + error = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (error=%d)\n", error); +#endif + + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + error = EINTR; + goto done2; + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + error = EIDRM; + goto done2; + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = td->td_proc->p_pid; + msqptr->msg_rtime = time_second; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + error = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", error); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz - len > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz - len; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + error = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (error != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + error); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + goto done2; + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + td->td_retval[0] = msgsz; +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +sysctl_msqids(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, msqids, + sizeof(struct msqid_ds) * msginfo.msgmni)); +} + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RD, &msginfo.msgmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RD, &msginfo.msgmnb, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RD, &msginfo.msgtql, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RD, &msginfo.msgssz, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RD, &msginfo.msgseg, 0, "") +SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD, + NULL, 0, sysctl_msqids, "", "Message queue IDs"); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..af784b8 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,1193 @@ +/* $FreeBSD$ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sem.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/jail.h> + +static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); + +static void seminit(void); +static int sysvsem_modload(struct module *, int, void *); +static int semunload(void); +static void semexit_myhook(struct proc *p); +static int sysctl_sema(SYSCTL_HANDLER_ARGS); + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl(struct thread *td, struct __semctl_args *uap); +struct semget_args; +int semget(struct thread *td, struct semget_args *uap); +struct semop_args; +int semop(struct thread *td, struct semop_args *uap); +#endif + +static struct sem_undo *semu_alloc(struct thread *td); +static int semundo_adjust(struct thread *td, struct sem_undo **supptr, + int semid, int semnum, int adjval); +static void semundo_clear(int semid, int semnum); + +/* XXX casting to (sy_call_t *) is bogus, as usual. 
*/ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop +}; + +static int semtot = 0; +static struct semid_ds *sema; /* semaphore id pool */ +static struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +static int *semu; /* undo structure pool */ + +struct sem { + u_short semval; /* semaphore value */ + pid_t sempid; /* pid of last operation */ + u_short semncnt; /* # awaiting semval > cval */ + u_short semzcnt; /* # awaiting semval = 0 */ +}; + +/* + * Undo structure (one per process) + */ +struct sem_undo { + struct sem_undo *un_next; /* ptr to next active undo structure */ + struct proc *un_proc; /* owner of this structure */ + short un_cnt; /* # of active entries */ + struct undo { + short un_adjval; /* adjust on exit values */ + short un_num; /* semaphore # */ + int un_id; /* semid */ + } un_ent[1]; /* undo entries */ +}; + +/* + * Configuration parameters + */ +#ifndef SEMMNI +#define SEMMNI 10 /* # of semaphore identifiers */ +#endif +#ifndef SEMMNS +#define SEMMNS 60 /* # of semaphores in system */ +#endif +#ifndef SEMUME +#define SEMUME 10 /* max # of undo entries per process */ +#endif +#ifndef SEMMNU +#define SEMMNU 30 /* # of undo structures in system */ +#endif + +/* shouldn't need tuning */ +#ifndef SEMMAP +#define SEMMAP 30 /* # of entries in semaphore map */ +#endif +#ifndef SEMMSL +#define SEMMSL SEMMNS /* max # of semaphores per id */ +#endif +#ifndef SEMOPM +#define SEMOPM 100 /* max # of operations per semop call */ +#endif + +#define SEMVMX 32767 /* semaphore maximum value */ +#define SEMAEM 16384 /* adjust on exit max value */ + +/* + * Due to the way semaphore memory is allocated, we have to ensure that + * SEMUSZ is properly aligned. 
+ */ + +#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) + +/* actual size of an undo structure */ +#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) + +/* + * Macro to find a particular sem_undo vector + */ +#define SEMU(ix) ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) + +/* + * semaphore info struct + */ +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RD, &seminfo.semmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RD, &seminfo.semmns, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RD, &seminfo.semmnu, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RD, &seminfo.semopm, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RD, &seminfo.semume, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RD, &seminfo.semusz, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD, + NULL, 0, sysctl_sema, "", ""); + +static void +seminit(void) +{ + register int i; + + TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); + TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); + TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); + TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); + TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl); + TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm); + TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume); + TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz); + TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx); + TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem); + + sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); + if (sem == NULL) + panic("sem is NULL"); + sema = malloc(sizeof(struct semid_ds) * seminfo.semmni, M_SEM, M_WAITOK); + if (sema == NULL) + panic("sema is NULL"); + semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; + at_exit(semexit_myhook); +} + +static int +semunload(void) +{ + + if (semtot != 0) + return (EBUSY); + + free(sem, M_SEM); + free(sema, M_SEM); + free(semu, M_SEM); + rm_at_exit(semexit_myhook); + return (0); +} + +static int +sysvsem_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + seminit(); + break; + case MOD_UNLOAD: + error = semunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvsem_mod = { + "sysvsem", + &sysvsem_modload, + NULL +}; + 
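[Editorial note, not part of the diff: the semsys/__semctl/semget/semop entry points registered just below implement the SVID semaphore API that userland sees. As a minimal illustrative sketch only — the key source path, permissions, and error handling here are arbitrary assumptions, not taken from this source — a process might drive these syscalls roughly like this:]

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	key_t key;
	int semid;
	struct sembuf op;

	/* Any agreed-upon key; ftok() arguments are illustrative. */
	key = ftok("/tmp", 'S');
	if (key == (key_t)-1) {
		perror("ftok");
		exit(1);
	}

	/* Create (or attach to) a set with one semaphore. */
	semid = semget(key, 1, IPC_CREAT | 0600);
	if (semid == -1) {
		perror("semget");
		exit(1);
	}

	/*
	 * "V" operation: increment semaphore 0.  SEM_UNDO asks the
	 * kernel to record a compensating adjustment so the value is
	 * rolled back if the process exits without releasing.
	 */
	op.sem_num = 0;
	op.sem_op = 1;
	op.sem_flg = SEM_UNDO;
	if (semop(semid, &op, 1) == -1)
		perror("semop");

	/* Remove the set when done. */
	if (semctl(semid, 0, IPC_RMID) == -1)
		perror("semctl");
	return (0);
}

[The SEM_UNDO flag in this sketch is what exercises the undo machinery in this file: semundo_adjust() records the per-process adjustment, and semexit_myhook() applies any outstanding adjustments when the process exits. End of editorial note.]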
+SYSCALL_MODULE_HELPER(semsys); +SYSCALL_MODULE_HELPER(__semctl); +SYSCALL_MODULE_HELPER(semget); +SYSCALL_MODULE_HELPER(semop); + +DECLARE_MODULE(sysvsem, sysvsem_mod, + SI_SUB_SYSV_SEM, SI_ORDER_FIRST); +MODULE_VERSION(sysvsem, 1); + +/* + * Entry point for all SEM calls + * + * MPSAFE + */ +int +semsys(td, uap) + struct thread *td; + /* XXX actually varargs. */ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*semcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(td) + struct thread *td; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. + * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = td->td_proc; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! + */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(td, supptr, semid, semnum, adjval) + register struct thread *td; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + struct proc *p = td->td_proc; + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(td); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). 
+ */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval != 0) { + adjval += sunptr->un_adjval; + if (adjval > seminfo.semaem || adjval < -seminfo.semaem) + return (ERANGE); + } + sunptr->un_adjval = adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (adjval > seminfo.semaem || adjval < -seminfo.semaem) + return (ERANGE); + if (suptr->un_cnt != seminfo.semume) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +/* + * MPSAFE + */ +int +__semctl(td, uap) + struct thread *td; + register struct __semctl_args *uap; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = td->td_ucred; + int i, rval, error; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + u_short usval; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + switch(cmd) { + case SEM_STAT: + if (semid < 0 || semid >= seminfo.semmni) + UGAR(EINVAL); + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ) + UGAR(EINVAL); + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + UGAR(error); + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + UGAR(error); + error = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + rval = IXSEQ_TO_IPCID(semid,semaptr->sem_perm); + if (error == 0) + td->td_retval[0] = rval; + goto done2; + } + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmni) { + error = EINVAL; + goto done2; + } + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EINVAL; + goto done2; + } + + error = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M))) + goto done2; + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + 
wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_M))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + if ((error = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) { + goto done2; + } + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time_second; + break; + + case IPC_STAT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + error = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + for (i = 0; i < semaptr->sem_nsems; i++) { + error = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (error != 0) + break; + } + break; + + case GETZCNT: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_R))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W))) + goto done2; + if (semnum < 0 || semnum >= semaptr->sem_nsems) { + error = EINVAL; + goto done2; + } + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + if (real_arg.val < 0 || real_arg.val > seminfo.semvmx) { + error = ERANGE; + goto done2; + } + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((error = ipcperm(td, &semaptr->sem_perm, IPC_W))) + goto done2; + if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + goto done2; + for (i = 0; i < semaptr->sem_nsems; i++) { + error = copyin(&real_arg.array[i], + (caddr_t)&usval, sizeof(real_arg.array[0])); + if (error != 0) + break; + if (usval > seminfo.semvmx) { + error = ERANGE; + break; + } + semaptr->sem_base[i].semval = usval; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + error = EINVAL; + break; + } + + if (error == 0) + td->td_retval[0] = rval; +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +/* + * MPSAFE + */ +int +semget(td, uap) + struct thread *td; + register struct semget_args *uap; +{ + int semid, error = 0; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = td->td_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, 
semflg); +#endif + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + if (key != IPC_PRIVATE) { + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((error = ipcperm(td, &sema[semid].sem_perm, + semflg & 0700))) { + goto done2; + } + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + error = EINVAL; + goto done2; + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + error = EEXIST; + goto done2; + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + error = EINVAL; + goto done2; + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + error = ENOSPC; + goto done2; + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + error = ENOSPC; + goto done2; + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time_second; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + error = ENOENT; + goto done2; + } + +found: + td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + u_int nsops; +}; +#endif + +/* + * MPSAFE + */ +int +semop(td, uap) + struct thread *td; + register struct semop_args *uap; +{ + int semid = uap->semid; + u_int nsops = uap->nsops; + struct sembuf *sops = NULL; + register struct semid_ds *semaptr; + register struct sembuf *sopptr = 0; + register struct sem *semptr = 0; + struct sem_undo *suptr; + int i, j, error; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops); +#endif + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + + mtx_lock(&Giant); + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmni) { + error = EINVAL; + goto done2; + } + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) { + error = EINVAL; + goto done2; + } + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EINVAL; + goto 
done2; + } + if (nsops > seminfo.semopm) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, + nsops); +#endif + error = E2BIG; + goto done2; + } + + /* Allocate memory for sem_ops */ + sops = malloc(nsops * sizeof(sops[0]), M_SEM, M_WAITOK); + if (!sops) + panic("Failed to allocate %d sem_ops", nsops); + + if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("error = %d from copyin(%08x, %08x, %d)\n", error, + uap->sops, sops, nsops * sizeof(sops[0])); +#endif + goto done2; + } + + /* + * Initial pass thru sops to see what permissions are needed. + * Also perform any checks that don't need repeating on each + * attempt to satisfy the request vector. + */ + j = 0; /* permission needed */ + do_undos = 0; + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + if (sopptr->sem_num >= semaptr->sem_nsems) { + error = EFBIG; + goto done2; + } + if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) + do_undos = 1; + j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; + } + + if ((error = ipcperm(td, &semaptr->sem_perm, j))) { +#ifdef SEM_DEBUG + printf("error = %d from ipaccess\n", error); +#endif + goto done2; + } + + /* + * Loop trying to satisfy the vector of requests. + * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. + * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + for (;;) { + do_wakeup = 0; + error = 0; /* error return if necessary */ + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + } else if (sopptr->sem_op == 0) { + if (semptr->semval != 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else if (semptr->semval + sopptr->sem_op > + seminfo.semvmx) { + error = ERANGE; + break; + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* If we detected an error, return it */ + if (error != 0) + goto done2; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. 
+ */ + if (sopptr->sem_flg & IPC_NOWAIT) { + error = EAGAIN; + goto done2; + } + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + error = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (error=%d)!\n", error); +#endif + + if (error != 0) { + error = EINTR; + goto done2; + } +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + error = EIDRM; + goto done2; + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + suptr = NULL; + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + error = semundo_adjust(td, &suptr, semid, + sops[i].sem_num, -adjval); + if (error == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. This guarantees that we won't run + * out of space as we roll things back out. + */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(td, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("error = %d from semundo_adjust\n", error); +#endif + goto done2; + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's and time */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = td->td_proc->p_pid; + } + semaptr->sem_otime = time_second; + + /* + * Do a wakeup if any semaphore was up'd whilst something was + * sleeping on it. + */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#endif + wakeup((caddr_t)semaptr); +#ifdef SEM_DEBUG + printf("semop: back from wakeup\n"); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + td->td_retval[0] = 0; +done2: + if (sops) + free(sops, M_SEM); + mtx_unlock(&Giant); + return (error); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +static void +semexit_myhook(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. + */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + return; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. 
+ */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + + wakeup((caddr_t)semaptr); +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; +} + +static int +sysctl_sema(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, sema, + sizeof(struct semid_ds) * seminfo.semmni)); +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..85356a0 --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,890 @@ +/* $FreeBSD$ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "opt_compat.h" +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/sysctl.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/jail.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); + +struct oshmctl_args; +static int oshmctl(struct thread *td, struct oshmctl_args *uap); + +static int shmget_allocate_segment(struct thread *td, + struct shmget_args *uap, int mode); +static int shmget_existing(struct thread *td, struct shmget_args *uap, + int mode, int segnum); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed, shmalloced; +static struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment(struct shmid_ds *); +static int shm_find_segment_by_key(key_t); +static struct shmid_ds *shm_find_segment_by_shmid(int); +static struct shmid_ds *shm_find_segment_by_shmidx(int); +static int shm_delete_mapping(struct proc *p, struct shmmap_state *); +static void shmrealloc(void); +static void shminit(void); +static int sysvshm_modload(struct module *, int, void *); +static int shmunload(void); +static void shmexit_myhook(struct proc *p); +static void shmfork_myhook(struct proc *p1, struct proc *p2); +static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); + +/* + * Tuneable values. + */ +#ifndef SHMMAXPGS +#define SHMMAXPGS 8192 /* Note: sysv shared memory is swap backed. 
*/ +#endif +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 192 +#endif +#ifndef SHMSEG +#define SHMSEG 128 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; + +static int shm_use_phys; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RD, &shminfo.shmmni, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RD, &shminfo.shmseg, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, + &shm_use_phys, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD, + NULL, 0, sysctl_shmsegs, "", ""); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shmalloced; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shmalloced) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static struct shmid_ds * +shm_find_segment_by_shmidx(int segnum) +{ + struct shmid_ds *shmseg; + + if (segnum < 0 || segnum >= shmalloced) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED ) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + GIANT_REQUIRED; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + GIANT_REQUIRED; + + segnum = IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, + shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time_second; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +/* + * MPSAFE + */ +int +shmdt(td, uap) + struct thread *td; + struct shmdt_args *uap; +{ + struct proc *p = td->td_proc; + struct shmmap_state *shmmap_s; + int i; + int error = 0; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + error = EINVAL; + goto done2; + } + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { + if (shmmap_s->shmid != -1 && + 
shmmap_s->va == (vm_offset_t)uap->shmaddr) { + break; + } + } + if (i == shminfo.shmseg) { + error = EINVAL; + goto done2; + } + error = shm_delete_mapping(p, shmmap_s); +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +/* + * MPSAFE + */ +int +shmat(td, uap) + struct thread *td; + struct shmat_args *uap; +{ + struct proc *p = td->td_proc; + int i, flags; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + int error = 0; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + error = ipcperm(td, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); + if (error) + goto done2; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) { + error = EMFILE; + goto done2; + } + size = round_page(shmseg->shm_segsz); +#ifdef VM_PROT_READ_IS_EXEC + prot = VM_PROT_READ | VM_PROT_EXECUTE; +#else + prot = VM_PROT_READ; +#endif + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) { + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + } else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) { + attach_va = (vm_offset_t)uap->shmaddr; + } else { + error = EINVAL; + goto done2; + } + } else { + /* + * This is just a hint to vm_map_find() about where to + * put it. + */ + attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr + + maxtsiz + maxdsiz); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + error = ENOMEM; + goto done2; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time_second; + shmseg->shm_nattch++; + td->td_retval[0] = attach_va; +done2: + mtx_unlock(&Giant); + return (error); +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. 
of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +/* + * MPSAFE + */ +static int +oshmctl(td, uap) + struct thread *td; + struct oshmctl_args *uap; +{ +#ifdef COMPAT_43 + int error = 0; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(td, &shmseg->shm_perm, IPC_R); + if (error) + goto done2; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + goto done2; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. */ + error = ((sy_call_t *)shmctl)(td, uap); + break; + } +done2: + mtx_unlock(&Giant); + return (error); +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +/* + * MPSAFE + */ +int +shmctl(td, uap) + struct thread *td; + struct shmctl_args *uap; +{ + int error = 0; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + switch (uap->cmd) { + case IPC_INFO: + error = copyout( (caddr_t)&shminfo, uap->buf, sizeof( shminfo ) ); + if (error) + goto done2; + td->td_retval[0] = shmalloced; + goto done2; + case SHM_INFO: { + struct shm_info shm_info; + shm_info.used_ids = shm_nused; + shm_info.shm_rss = 0; /*XXX where to get from ? */ + shm_info.shm_tot = 0; /*XXX where to get from ? */ + shm_info.shm_swp = 0; /*XXX where to get from ? */ + shm_info.swap_attempts = 0; /*XXX where to get from ? */ + shm_info.swap_successes = 0; /*XXX where to get from ? 
*/ + error = copyout( (caddr_t)&shm_info, uap->buf, sizeof( shm_info ) ); + if (error) + goto done2; + td->td_retval[0] = shmalloced; + goto done2; + } + } + if( (uap->cmd) == SHM_STAT ) + shmseg = shm_find_segment_by_shmidx(uap->shmid); + else + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) { + error = EINVAL; + goto done2; + } + switch (uap->cmd) { + case SHM_STAT: + case IPC_STAT: + error = ipcperm(td, &shmseg->shm_perm, IPC_R); + if (error) + goto done2; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + goto done2; + else if( (uap->cmd) == SHM_STAT ) + td->td_retval[0] = IXSEQ_TO_IPCID( uap->shmid, shmseg->shm_perm ); + break; + case IPC_SET: + error = ipcperm(td, &shmseg->shm_perm, IPC_M); + if (error) + goto done2; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + goto done2; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time_second; + break; + case IPC_RMID: + error = ipcperm(td, &shmseg->shm_perm, IPC_M); + if (error) + goto done2; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + error = EINVAL; + break; + } +done2: + mtx_unlock(&Giant); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(td, uap, mode, segnum) + struct thread *td; + struct shmget_args *uap; + int mode; + int segnum; +{ + struct shmid_ds *shmseg; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + error = ipcperm(td, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(td, uap, mode) + struct thread *td; + struct shmget_args *uap; + int mode; +{ + int i, segnum, shmid, size; + struct ucred *cred = td->td_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + GIANT_REQUIRED; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + shmrealloc(); /* Maybe expand the shmsegs[] array. */ + for (i = 0; i < shmalloced; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shmalloced) + return ENOSPC; + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. 
+ */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. + */ + if (shm_use_phys) { + shm_handle->shm_object = + vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); + } else { + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); + } + vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING); + vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT); + + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = td->td_proc->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time_second; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + td->td_retval[0] = shmid; + return 0; +} + +/* + * MPSAFE + */ +int +shmget(td, uap) + struct thread *td; + struct shmget_args *uap; +{ + int segnum, mode; + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + mtx_lock(&Giant); + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(td, uap, mode, segnum); + if (error == EAGAIN) + goto again; + goto done2; + } + if ((uap->shmflg & IPC_CREAT) == 0) { + error = ENOENT; + goto done2; + } + } + error = shmget_allocate_segment(td, uap, mode); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +shmsys(td, uap) + struct thread *td; + /* XXX actually varargs. 
*/ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; +{ + int error; + + if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + return (ENOSYS); + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return (EINVAL); + mtx_lock(&Giant); + error = (*shmcalls[uap->which])(td, &uap->a2); + mtx_unlock(&Giant); + return (error); +} + +static void +shmfork_myhook(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +static void +shmexit_myhook(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + GIANT_REQUIRED; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +static void +shmrealloc(void) +{ + int i; + struct shmid_ds *newsegs; + + if (shmalloced >= shminfo.shmmni) + return; + + newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); + if (newsegs == NULL) + return; + for (i = 0; i < shmalloced; i++) + bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); + for (; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + free(shmsegs, M_SHM); + shmsegs = newsegs; + shmalloced = shminfo.shmmni; +} + +static void +shminit() +{ + int i; + + TUNABLE_INT_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall); + shminfo.shmmax = shminfo.shmall * PAGE_SIZE; + TUNABLE_INT_FETCH("kern.ipc.shmmin", &shminfo.shmmin); + TUNABLE_INT_FETCH("kern.ipc.shmmni", &shminfo.shmmni); + TUNABLE_INT_FETCH("kern.ipc.shmseg", &shminfo.shmseg); + TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys); + + shmalloced = shminfo.shmmni; + shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); + if (shmsegs == NULL) + panic("cannot allocate initial memory for sysvshm"); + for (i = 0; i < shmalloced; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; + shmexit_hook = &shmexit_myhook; + shmfork_hook = &shmfork_myhook; +} + +static int +shmunload() +{ + + if (shm_nused > 0) + return (EBUSY); + + free(shmsegs, M_SHM); + shmexit_hook = NULL; + shmfork_hook = NULL; + return (0); +} + +static int +sysctl_shmsegs(SYSCTL_HANDLER_ARGS) +{ + + return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0]))); +} + +static int +sysvshm_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + shminit(); + break; + case MOD_UNLOAD: + error = shmunload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t sysvshm_mod = { + "sysvshm", + &sysvshm_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(shmsys); +SYSCALL_MODULE_HELPER(shmat); +SYSCALL_MODULE_HELPER(shmctl); +SYSCALL_MODULE_HELPER(shmdt); +SYSCALL_MODULE_HELPER(shmget); + +DECLARE_MODULE(sysvshm, sysvshm_mod, + SI_SUB_SYSV_SHM, SI_ORDER_FIRST); +MODULE_VERSION(sysvshm, 1); diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 0000000..b9c5743 --- 
/dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,2660 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Copyright (c) 2002 Networks Associates Technologies, Inc. + * All rights reserved. + * + * Portions of this software were developed for the FreeBSD Project by + * ThinkSec AS and NAI Labs, the Security Research Division of Network + * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 + * ("CBOSS"), as part of the DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. + * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. 
+ * o Now that historical speed conversions are handled here, don't + * do them in drivers. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/sx.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#define TTYDEFCHARS +#include <sys/tty.h> +#undef TTYDEFCHARS +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/dkstat.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); + +static int proc_compare(struct proc *p1, struct proc *p2); +static int ttnread(struct tty *tp); +static void ttyecho(int c, struct tty *tp); +static int ttyoutput(int c, struct tty *tp); +static void ttypend(struct tty *tp); +static void ttyretype(struct tty *tp); +static void ttyrub(int c, struct tty *tp); +static void ttyrubo(struct tty *tp, int cnt); +static void ttyunblock(struct tty *tp); +static int ttywflush(struct tty *tp); +static int filt_ttyread(struct knote *kn, long hint); +static void filt_ttyrdetach(struct knote *kn); +static int filt_ttywrite(struct knote *kn, long hint); +static void filt_ttywdetach(struct knote *kn); + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. */ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +static u_char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? 
*/ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. + */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ + +/* + * list of struct tty where pstat(8) can pick it up with sysctl + */ +static SLIST_HEAD(, tty) tty_list; + +static int drainwait = 5*60; +SYSCTL_INT(_kern, OID_AUTO, drainwait, CTLFLAG_RW, &drainwait, + 0, "Output drain timeout in seconds"); + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(dev_t device, struct tty *tp) +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + /* XXX don't hang forever on output */ + if (tp->t_timeout < 0) + tp->t_timeout = drainwait*hz; + ttsetwater(tp); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. + */ +int +ttyclose(struct tty *tp) +{ + int s; + + funsetown(&tp->t_sigio); + s = spltty(); + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + + tp->t_gen++; + tp->t_line = TTYDISC; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + splx(s); + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) + +/* + * Process input of a single character received on a tty. + */ +int +ttyinput(int c, struct tty *tp) +{ + tcflag_t iflag, lflag; + cc_t *cc; + int i, err; + + /* + * If input is pending take it first. 
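+ *
+ * (PENDIN is set by the TIOCSETA* handling in ttioctl() when ICANON is
+ * switched on, so characters collected under the old flags are pushed
+ * back through this routine by ttypend() under the new ones.)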
+ */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + + /* Handle exceptional conditions (break, parity, framing). */ + cc = tp->t_cc; + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_BI)) { + if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGINT, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + return (0); + else if (ISSET(iflag, PARMRK)) { +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. + */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + goto endcase; + } + } + /* + * Handle start/stop characters. 
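+ *
+ * This is the IXON side of software flow control: with IXON set, the
+ * VSTOP character (^S by default) suspends output and VSTART (^Q)
+ * resumes it.  A minimal userland sketch, where fd is an assumed open
+ * tty descriptor:
+ *
+ *	struct termios t;
+ *	tcgetattr(fd, &t);
+ *	t.c_iflag |= IXON;
+ *	tcsetattr(fd, TCSANOW, &t);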
+ */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + return (0); + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase or erase2 (^H / ^?) + */ + if (CCEQ(cc[VERASE], c) || CCEQ(cc[VERASE2], c) ) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { + if (ISSET(lflag, ISIG) && tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGINFO, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_ohiwat) + (void)ttyoutput(CTRL('g'), tp); + } + goto endcase; + } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. + */ + if (putc(c, &tp->t_rawq) >= 0) { + if (!ISSET(lflag, ICANON)) { + ttwakeup(tp); + ttyecho(c, tp); + goto endcase; + } + if (TTBREAKC(c, lflag)) { + tp->t_rocount = 0; + catq(&tp->t_rawq, &tp->t_canq); + ttwakeup(tp); + } else if (tp->t_rocount++ == 0) + tp->t_rocol = tp->t_column; + if (ISSET(tp->t_state, TS_ERASE)) { + /* + * end of prterase \.../ + */ + CLR(tp->t_state, TS_ERASE); + (void)ttyoutput('/', tp); + } + i = tp->t_column; + ttyecho(c, tp); + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { + /* + * Place the cursor over the '^' of the ^D. 
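+ *
+ * (With ECHO and ECHOCTL set, the EOF character echoes as "^D": ttyecho()
+ * below emits '^' and then c + 'A' - 1 for control characters, so backing
+ * up at most two columns here parks the cursor on the '^'.)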
+ */ + i = imin(2, tp->t_column - i); + while (i > 0) { + (void)ttyoutput('\b', tp); + i--; + } + } + } +endcase: + /* + * IXANY means allow any character to restart output. + */ + if (ISSET(tp->t_state, TS_TTSTOP) && + !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) + return (0); +restartoutput: + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); +startoutput: + return (ttstart(tp)); +} + +/* + * Output a single character on a tty, doing output processing + * as needed (expanding tabs, newline processing, etc.). + * Returns < 0 if succeeds, otherwise returns char to resend. + * Must be recursive. + */ +static int +ttyoutput(int c, struct tty *tp) +{ + tcflag_t oflag; + int col, s; + + oflag = tp->t_oflag; + if (!ISSET(oflag, OPOST)) { + if (ISSET(tp->t_lflag, FLUSHO)) + return (-1); + if (putc(c, &tp->t_outq)) + return (c); + tk_nout++; + tp->t_outcc++; + return (-1); + } + /* + * Do tab expansion if OXTABS is set. Special case if we external + * processing, we don't do the tab expansion because we'll probably + * get it wrong. If tab expansion needs to be done, let it happen + * externally. + */ + CLR(c, ~TTY_CHARMASK); + if (c == '\t' && + ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { + c = 8 - (tp->t_column & 7); + if (!ISSET(tp->t_lflag, FLUSHO)) { + s = spltty(); /* Don't interrupt tabs. */ + c -= b_to_q(" ", c, &tp->t_outq); + tk_nout += c; + tp->t_outcc += c; + splx(s); + } + tp->t_column += c; + return (c ? -1 : '\t'); + } + if (c == CEOT && ISSET(oflag, ONOEOT)) + return (-1); + + /* + * Newline translation: if ONLCR is set, + * translate newline into "\r\n". + */ + if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq)) + return (c); + } + /* If OCRNL is set, translate "\r" into "\n". */ + else if (c == '\r' && ISSET(tp->t_oflag, OCRNL)) + c = '\n'; + /* If ONOCR is set, don't transmit CRs when on column 0. */ + else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0) + return (-1); + + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) + return (c); + + col = tp->t_column; + switch (CCLASS(c)) { + case BACKSPACE: + if (col > 0) + --col; + break; + case CONTROL: + break; + case NEWLINE: + if (ISSET(tp->t_oflag, ONLCR | ONLRET)) + col = 0; + break; + case RETURN: + col = 0; + break; + case ORDINARY: + ++col; + break; + case TAB: + col = (col + 8) & ~7; + break; + } + tp->t_column = col; + return (-1); +} + +/* + * Ioctls for all tty devices. Called after line-discipline specific ioctl + * has been called to do discipline-specific functions and/or reject any + * of these ioctl commands. + */ +/* ARGSUSED */ +int +ttioctl(struct tty *tp, u_long cmd, void *data, int flag) +{ + struct proc *p; + struct thread *td; + struct pgrp *pgrp; + int s, error; + + td = curthread; /* XXX */ + p = td->td_proc; + + /* If the ioctl involves modification, hang if in the background. 
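+ *
+ * The cases listed just below are the "modifying" ioctls: a caller that
+ * is not in the foreground process group, and is neither ignoring nor
+ * blocking SIGTTOU, is sent SIGTTOU and sleeps until it is moved to the
+ * foreground.  This is why, for example, a backgrounded stty(1) (whose
+ * tcsetattr() ends up in the TIOCSET* cases) stops instead of changing
+ * the terminal.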
*/ + switch (cmd) { + case TIOCCBRK: + case TIOCCONS: + case TIOCDRAIN: + case TIOCEXCL: + case TIOCFLUSH: +#ifdef TIOCHPCL + case TIOCHPCL: +#endif + case TIOCNXCL: + case TIOCSBRK: + case TIOCSCTTY: + case TIOCSDRAINWAIT: + case TIOCSETA: + case TIOCSETAF: + case TIOCSETAW: + case TIOCSETD: + case TIOCSPGRP: + case TIOCSTART: + case TIOCSTAT: + case TIOCSTI: + case TIOCSTOP: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + sx_slock(&proctree_lock); + PROC_LOCK(p); + while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) && + !SIGISMEMBER(p->p_sigignore, SIGTTOU) && + !SIGISMEMBER(p->p_sigmask, SIGTTOU)) { + pgrp = p->p_pgrp; + PROC_UNLOCK(p); + if (pgrp->pg_jobc == 0) { + sx_sunlock(&proctree_lock); + return (EIO); + } + PGRP_LOCK(pgrp); + sx_sunlock(&proctree_lock); + pgsignal(pgrp, SIGTTOU, 1); + PGRP_UNLOCK(pgrp); + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) + return (error); + sx_slock(&proctree_lock); + PROC_LOCK(p); + } + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. */ + case FIONREAD: /* get # bytes to read */ + s = spltty(); + *(int *)data = ttnread(tp); + splx(s); + break; + + case FIOSETOWN: + /* + * Policy -- Don't allow FIOSETOWN on someone else's + * controlling tty + */ + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + + error = fsetown(*(int *)data, &tp->t_sigio); + if (error) + return (error); + break; + case FIOGETOWN: + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + *(int *)data = fgetown(tp->t_sigio); + break; + + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + struct nameidata nid; + + if (constty && constty != tp && + ISSET(constty->t_state, TS_CONNECTED)) + return (EBUSY); + + /* Ensure user can open the real console. */ + NDINIT(&nid, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, + "/dev/console", td); + if ((error = namei(&nid)) != 0) + return (error); + NDFREE(&nid, NDF_ONLY_PNBUF); + error = VOP_ACCESS(nid.ni_vp, VREAD, td->td_ucred, td); + vput(nid.ni_vp); + if (error) + return (error); + + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + error = ttywait(tp); + if (error) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + struct termios *t = (struct termios *)data; + + if (t->c_ispeed == 0) + t->c_ispeed = t->c_ospeed; + if (t->c_ispeed == 0) + t->c_ispeed = tp->t_ospeed; + if (t->c_ispeed == 0) + return (EINVAL); + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + error = ttywait(tp); + if (error) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. + */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + if (t->c_ospeed != 0) + tp->t_ospeed = t->c_ospeed; + ttsetwater(tp); + } + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { + catq(&tp->t_rawq, &tp->t_canq); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. + */ + catq(&tp->t_canq, &tp->t_rawq); + } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. 
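+ *
+ * (For reference, the TIOCSETA/TIOCSETAW/TIOCSETAF group handled here is
+ * the ioctl set behind tcsetattr(3)'s optional actions:
+ *
+ *	TCSANOW    -> TIOCSETA	set immediately
+ *	TCSADRAIN  -> TIOCSETAW	drain output, then set
+ *	TCSAFLUSH  -> TIOCSETAF	drain output, flush input, then set
+ *
+ * so the drain and flush behaviour above is what those calls rely on.)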
+ */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if ((flag & FREAD) == 0 && suser(td)) + return (EPERM); + if (!isctty(p, tp) && suser(td)) + return (EACCES); + s = spltty(); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. */ + sx_slock(&proctree_lock); + if (!SESS_LEADER(p) || + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + SESS_LOCK(p->p_session); + p->p_session->s_ttyp = tp; + SESS_UNLOCK(p->p_session); + PROC_LOCK(p); + p->p_flag |= P_CONTROLT; + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + break; + case TIOCSPGRP: { /* set pgrp of tty */ + sx_slock(&proctree_lock); + pgrp = pgfind(*(int *)data); + if (!isctty(p, tp)) { + if (pgrp != NULL) + PGRP_UNLOCK(pgrp); + sx_sunlock(&proctree_lock); + return (ENOTTY); + } + if (pgrp == NULL) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + PGRP_UNLOCK(pgrp); + if (pgrp->pg_session != p->p_session) { + sx_sunlock(&proctree_lock); + return (EPERM); + } + sx_sunlock(&proctree_lock); + tp->t_pgrp = pgrp; + break; + } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGWINCH, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + } + break; + case TIOCSDRAINWAIT: + error = suser(td); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (ENOIOCTL); +#endif + } + return (0); +} + +int +ttypoll(dev_t dev, int events, struct thread *td) +{ + int s; + int revents = 0; + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) /* XXX used to return ENXIO, but that means true! 
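+ *
+ * A consumer's view of this routine, as an illustrative sketch (ttyfd,
+ * buf and n are assumed to exist):
+ *
+ *	struct pollfd pfd;
+ *	pfd.fd = ttyfd;
+ *	pfd.events = POLLIN;
+ *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
+ *		n = read(ttyfd, buf, sizeof(buf));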
*/ + return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) + | POLLHUP); + + s = spltty(); + if (events & (POLLIN | POLLRDNORM)) { + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &tp->t_rsel); + } + if (events & (POLLOUT | POLLWRNORM)) { + if ((tp->t_outq.c_cc <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLOUT | POLLWRNORM); + else + selrecord(td, &tp->t_wsel); + } + splx(s); + return (revents); +} + +static struct filterops ttyread_filtops = + { 1, NULL, filt_ttyrdetach, filt_ttyread }; +static struct filterops ttywrite_filtops = + { 1, NULL, filt_ttywdetach, filt_ttywrite }; + +int +ttykqfilter(dev_t dev, struct knote *kn) +{ + struct tty *tp = dev->si_tty; + struct klist *klist; + int s; + + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &tp->t_rsel.si_note; + kn->kn_fop = &ttyread_filtops; + break; + case EVFILT_WRITE: + klist = &tp->t_wsel.si_note; + kn->kn_fop = &ttywrite_filtops; + break; + default: + return (1); + } + + kn->kn_hook = (caddr_t)dev; + + s = spltty(); + SLIST_INSERT_HEAD(klist, kn, kn_selnext); + splx(s); + + return (0); +} + +static void +filt_ttyrdetach(struct knote *kn) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + int s = spltty(); + + SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); + splx(s); +} + +static int +filt_ttyread(struct knote *kn, long hint) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + + kn->kn_data = ttnread(tp); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + return (1); + } + return (kn->kn_data > 0); +} + +static void +filt_ttywdetach(struct knote *kn) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + int s = spltty(); + + SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); + splx(s); +} + +static int +filt_ttywrite(struct knote *kn, long hint) +{ + struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; + + kn->kn_data = tp->t_outq.c_cc; + if (ISSET(tp->t_state, TS_ZOMBIE)) + return (1); + return (kn->kn_data <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)); +} + +/* + * Must be called at spltty(). + */ +static int +ttnread(struct tty *tp) +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) { + nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } + return (nread); +} + +/* + * Wait for output to drain. + */ +int +ttywait(struct tty *tp) +{ + int error, s; + + error = 0; + s = spltty(); + while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { + (*tp->t_oproc)(tp); + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else + break; + } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; + splx(s); + return (error); +} + +/* + * Flush if successfully wait. + */ +static int +ttywflush(struct tty *tp) +{ + int error; + + if ((error = ttywait(tp)) == 0) + ttyflush(tp, FREAD); + return (error); +} + +/* + * Flush tty read and/or write queues, notifying anyone waiting. 
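+ *
+ * Callers include the TIOCFLUSH case in ttioctl() above (which is how
+ * tcflush(3) is typically implemented), ttylclose(), and the BRKINT and
+ * ISIG paths in ttyinput(); rw is FREAD, FWRITE or both.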
+ */ +void +ttyflush(struct tty *tp, int rw) +{ + int s; + + s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } + (*tp->t_stop)(tp, rw); + if (rw & FREAD) { + FLUSHQ(&tp->t_canq); + FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); + tp->t_rocount = 0; + tp->t_rocol = 0; + CLR(tp->t_state, TS_LOCAL); + ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } + } + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + ttwwakeup(tp); + } + splx(s); +} + +/* + * Copy in the default termios characters. + */ +void +termioschars(struct termios *t) +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. + */ +void +ttychars(struct tty *tp) +{ + + termioschars(&tp->t_termios); +} + +/* + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. + */ +void +ttyblock(struct tty *tp) +{ + + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(struct tty *tp) +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. + */ +void +ttrstrt(void *tp_arg) +{ + struct tty *tp; + int s; + + KASSERT(tp_arg != NULL, ("ttrstrt")); + + tp = tp_arg; + s = spltty(); + + CLR(tp->t_state, TS_TIMEOUT); + ttstart(tp); + + splx(s); +} +#endif + +int +ttstart(struct tty *tp) +{ + + if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ + (*tp->t_oproc)(tp); + return (0); +} + +/* + * "close" a line discipline + */ +int +ttylclose(struct tty *tp, int flag) +{ + + if (flag & FNONBLOCK || ttywflush(tp)) + ttyflush(tp, FREAD | FWRITE); + return (0); +} + +/* + * Handle modem control transition on a tty. + * Flag indicates new state of carrier. + * Returns 0 if the line should be turned off, otherwise 1. 
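+ *
+ * A serial driver typically reaches this through the line discipline's
+ * l_modem entry from its modem-status interrupt; a sketch, where
+ * dcd_asserted is an assumed flag computed by the driver:
+ *
+ *	if ((*linesw[tp->t_line].l_modem)(tp, dcd_asserted) == 0)
+ *		... drop DTR and hang up the line ...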
+ */ +int +ttymodem(struct tty *tp, int flag) +{ + + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { + /* + * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. + */ + if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); + SET(tp->t_state, TS_TTSTOP); + (*tp->t_stop)(tp, 0); + } + } else if (flag == 0) { + /* + * Lost carrier. + */ + CLR(tp->t_state, TS_CARR_ON); + if (ISSET(tp->t_state, TS_ISOPEN) && + !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); + if (tp->t_session) { + sx_slock(&proctree_lock); + if (tp->t_session->s_leader) { + struct proc *p; + + p = tp->t_session->s_leader; + PROC_LOCK(p); + psignal(p, SIGHUP); + PROC_UNLOCK(p); + } + sx_sunlock(&proctree_lock); + } + ttyflush(tp, FREAD | FWRITE); + return (0); + } + } else { + /* + * Carrier now on. + */ + SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +static void +ttypend(struct tty *tp) +{ + struct clist tq; + int c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ + tq = tp->t_rawq; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(struct tty *tp, struct uio *uio, int flag) +{ + struct clist *qp; + int c; + tcflag_t lflag; + cc_t *cc = tp->t_cc; + struct proc *p = curproc; + int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ + struct timeval stime; + struct pgrp *pg; + +loop: + s = spltty(); + lflag = tp->t_lflag; + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) { + ttypend(tp); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + splx(s); + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (SIGISMEMBER(p->p_sigignore, SIGTTIN) || + SIGISMEMBER(p->p_sigmask, SIGTTIN) || + (p->p_flag & P_PPWAIT) || p->p_pgrp->pg_jobc == 0) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + return (EIO); + } + pg = p->p_pgrp; + PROC_UNLOCK(p); + PGRP_LOCK(pg); + sx_sunlock(&proctree_lock); + pgsignal(pg, SIGTTIN, 1); + PGRP_UNLOCK(pg); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) + return (error); + goto loop; + } + + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. + * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? 
&tp->t_canq : &tp->t_rawq; + + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { + splx(s); + return (0); + } + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval timecopy; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. */ + splx(s); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } + } +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see tvtohz() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use tvtohz() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); + splx(s); + if (error == EWOULDBLOCK) + error = 0; + else if (error) + return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). + */ + slp = 0; + goto loop; + } +read: + splx(s); + /* + * Input present, check for input mapping and processing. + */ + first = 1; + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. 
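+ *
+ * (The four VMIN/VTIME combinations handled above implement the standard
+ * non-canonical read semantics.  An illustrative userland setup, where fd
+ * is an assumed open tty descriptor:
+ *
+ *	struct termios t;
+ *	tcgetattr(fd, &t);
+ *	t.c_lflag &= ~ICANON;
+ *	t.c_cc[VMIN] = 10;
+ *	t.c_cc[VTIME] = 5;
+ *	tcsetattr(fd, TCSANOW, &t);
+ *
+ * makes read(2) wait for the first byte and then return either when 10
+ * bytes have accumulated or when 0.5 seconds pass without another byte,
+ * VTIME being counted in tenths of a second.)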
+ */ + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if (first) { + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + /* XXX should ungetc(c, qp). */ + break; + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) + break; + first = 0; + } + +out: + /* + * Look to unblock input now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) + ttyunblock(tp); + splx(s); + + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. + */ +int +ttycheckoutq(struct tty *tp, int wait) +{ + int hiwat, s; + sigset_t oldmask; + + hiwat = tp->t_ohiwat; + SIGEMPTYSET(oldmask); + s = spltty(); + if (wait) + oldmask = curproc->p_siglist; + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) + while (tp->t_outq.c_cc > hiwat) { + ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; + if (!(wait && SIGSETEQ(curproc->p_siglist, oldmask))) { + splx(s); + return (0); + } + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); + } + splx(s); + return (1); +} + +/* + * Process a write call on a tty device. + */ +int +ttwrite(struct tty *tp, struct uio *uio, int flag) +{ + char *cp = NULL; + int cc, ce; + struct proc *p; + int i, hiwat, cnt, error, s; + char obuf[OBUFSIZ]; + + hiwat = tp->t_ohiwat; + cnt = uio->uio_resid; + error = 0; + cc = 0; +loop: + s = spltty(); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { + splx(s); + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; + } + splx(s); + /* + * Hang the process if it's in the background. 
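+ *
+ * Unlike the read-side check in ttread(), which signals SIGTTIN
+ * regardless, background writes are stopped only when TOSTOP is set in
+ * c_lflag, e.g. after "stty tostop" in the controlling shell.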
+ */ + p = curproc; + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (isbackground(p, tp) && + ISSET(tp->t_lflag, TOSTOP) && !(p->p_flag & P_PPWAIT) && + !SIGISMEMBER(p->p_sigignore, SIGTTOU) && + !SIGISMEMBER(p->p_sigmask, SIGTTOU)) { + if (p->p_pgrp->pg_jobc == 0) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + error = EIO; + goto out; + } + PROC_UNLOCK(p); + PGRP_LOCK(p->p_pgrp); + sx_sunlock(&proctree_lock); + pgsignal(p->p_pgrp, SIGTTOU, 1); + PGRP_UNLOCK(p->p_pgrp); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) + goto out; + goto loop; + } else { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + } + /* + * Process the user's data in at most OBUFSIZ chunks. Perform any + * output translation. Keep track of high water mark, sleep on + * overflow awaiting device aid in acquiring new space. + */ + while (uio->uio_resid > 0 || cc > 0) { + if (ISSET(tp->t_lflag, FLUSHO)) { + uio->uio_resid = 0; + return (0); + } + if (tp->t_outq.c_cc > hiwat) + goto ovhiwat; + /* + * Grab a hunk of data from the user, unless we have some + * leftover from last time. + */ + if (cc == 0) { + cc = imin(uio->uio_resid, OBUFSIZ); + cp = obuf; + error = uiomove(cp, cc, uio); + if (error) { + cc = 0; + break; + } + } + /* + * If nothing fancy need be done, grab those characters we + * can handle without any of ttyoutput's processing and + * just transfer them to the output q. For those chars + * which require special processing (as indicated by the + * bits in char_type), call ttyoutput. After processing + * a hunk of data, look for FLUSHO so ^O's will take effect + * immediately. + */ + while (cc > 0) { + if (!ISSET(tp->t_oflag, OPOST)) + ce = cc; + else { + ce = cc - scanc((u_int)cc, (u_char *)cp, + char_type, CCLASSMASK); + /* + * If ce is zero, then we're processing + * a special character through ttyoutput. + */ + if (ce == 0) { + tp->t_rocount = 0; + if (ttyoutput(*cp, tp) >= 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; + goto loop; + } + cp++; + cc--; + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + goto ovhiwat; + continue; + } + } + /* + * A bunch of normal characters have been found. + * Transfer them en masse to the output queue and + * continue processing at the top of the loop. + * If there are any further characters in this + * <= OBUFSIZ chunk, the first should be a character + * requiring special handling by ttyoutput. + */ + tp->t_rocount = 0; + i = b_to_q(cp, ce, &tp->t_outq); + ce -= i; + tp->t_column += ce; + cp += ce, cc -= ce, tk_nout += ce; + tp->t_outcc += ce; + if (i > 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; + goto loop; + } + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + break; + } + ttstart(tp); + } +out: + /* + * If cc is nonzero, we leave the uio structure inconsistent, as the + * offset and iov pointers have moved forward, but it doesn't matter + * (the call will either return short or restart with a new uio). + */ + uio->uio_resid += cc; + return (error); + +ovhiwat: + ttstart(tp); + s = spltty(); + /* + * This can only occur if FLUSHO is set in t_lflag, + * or if ttstart/oproc is synchronous (or very fast). 
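+ *
+ * The matching wakeup for the sleep below is in ttwwakeup(): once the
+ * driver has drained t_outq to t_olowat it clears TS_SO_OLOWAT and wakes
+ * TSA_OLOWAT(tp), and the loop above is retried.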
+ */ + if (tp->t_outq.c_cc <= hiwat) { + splx(s); + goto loop; + } + if (flag & IO_NDELAY) { + splx(s); + uio->uio_resid += cc; + return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); + } + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); + splx(s); + if (error == EWOULDBLOCK) + error = EIO; + if (error) + goto out; + goto loop; +} + +/* + * Rubout one character from the rawq of tp + * as cleanly as possible. + */ +static void +ttyrub(int c, struct tty *tp) +{ + char *cp; + int savecol; + int tabc, s; + + if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) + return; + CLR(tp->t_lflag, FLUSHO); + if (ISSET(tp->t_lflag, ECHOE)) { + if (tp->t_rocount == 0) { + /* + * Screwed by ttwrite; retype + */ + ttyretype(tp); + return; + } + if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) + ttyrubo(tp, 2); + else { + CLR(c, ~TTY_CHARMASK); + switch (CCLASS(c)) { + case ORDINARY: + ttyrubo(tp, 1); + break; + case BACKSPACE: + case CONTROL: + case NEWLINE: + case RETURN: + case VTAB: + if (ISSET(tp->t_lflag, ECHOCTL)) + ttyrubo(tp, 2); + break; + case TAB: + if (tp->t_rocount < tp->t_rawq.c_cc) { + ttyretype(tp); + return; + } + s = spltty(); + savecol = tp->t_column; + SET(tp->t_state, TS_CNTTB); + SET(tp->t_lflag, FLUSHO); + tp->t_column = tp->t_rocol; + cp = tp->t_rawq.c_cf; + if (cp) + tabc = *cp; /* XXX FIX NEXTC */ + for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) + ttyecho(tabc, tp); + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_CNTTB); + splx(s); + + /* savecol will now be length of the tab. */ + savecol -= tp->t_column; + tp->t_column += savecol; + if (savecol > 8) + savecol = 8; /* overflow screw */ + while (--savecol >= 0) + (void)ttyoutput('\b', tp); + break; + default: /* XXX */ +#define PANICSTR "ttyrub: would panic c = %d, val = %d\n" + (void)printf(PANICSTR, c, CCLASS(c)); +#ifdef notdef + panic(PANICSTR, c, CCLASS(c)); +#endif + } + } + } else if (ISSET(tp->t_lflag, ECHOPRT)) { + if (!ISSET(tp->t_state, TS_ERASE)) { + SET(tp->t_state, TS_ERASE); + (void)ttyoutput('\\', tp); + } + ttyecho(c, tp); + } else { + ttyecho(tp->t_cc[VERASE], tp); + /* + * This code may be executed not only when an ERASE key + * is pressed, but also when ^U (KILL) or ^W (WERASE) are. + * So, I didn't think it was worthwhile to pass the extra + * information (which would need an extra parameter, + * changing every call) needed to distinguish the ERASE2 + * case from the ERASE. + */ + } + --tp->t_rocount; +} + +/* + * Back over cnt characters, erasing them. + */ +static void +ttyrubo(struct tty *tp, int cnt) +{ + + while (cnt-- > 0) { + (void)ttyoutput('\b', tp); + (void)ttyoutput(' ', tp); + (void)ttyoutput('\b', tp); + } +} + +/* + * ttyretype -- + * Reprint the rawq line. Note, it is assumed that c_cc has already + * been checked. + */ +static void +ttyretype(struct tty *tp) +{ + char *cp; + int s, c; + + /* Echo the reprint character. */ + if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) + ttyecho(tp->t_cc[VREPRINT], tp); + + (void)ttyoutput('\n', tp); + + /* + * XXX + * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE + * BIT OF FIRST CHAR. + */ + s = spltty(); + for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) + ttyecho(c, tp); + for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? 
*cp : 0); + cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) + ttyecho(c, tp); + CLR(tp->t_state, TS_ERASE); + splx(s); + + tp->t_rocount = tp->t_rawq.c_cc; + tp->t_rocol = 0; +} + +/* + * Echo a typed character to the terminal. + */ +static void +ttyecho(int c, struct tty *tp) +{ + + if (!ISSET(tp->t_state, TS_CNTTB)) + CLR(tp->t_lflag, FLUSHO); + if ((!ISSET(tp->t_lflag, ECHO) && + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || + ISSET(tp->t_lflag, EXTPROC)) + return; + if (ISSET(tp->t_lflag, ECHOCTL) && + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || + ISSET(c, TTY_CHARMASK) == 0177)) { + (void)ttyoutput('^', tp); + CLR(c, ~TTY_CHARMASK); + if (c == 0177) + c = '?'; + else + c += 'A' - 1; + } + (void)ttyoutput(c, tp); +} + +/* + * Wake up any readers on a tty. + */ +void +ttwakeup(struct tty *tp) +{ + + if (SEL_WAITING(&tp->t_rsel)) + selwakeup(&tp->t_rsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); + wakeup(TSA_HUP_OR_INPUT(tp)); + KNOTE(&tp->t_rsel.si_note, 0); +} + +/* + * Wake up any writers on a tty. + */ +void +ttwwakeup(struct tty *tp) +{ + + if (SEL_WAITING(&tp->t_wsel) && tp->t_outq.c_cc <= tp->t_olowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_olowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } + KNOTE(&tp->t_wsel.si_note, 0); +} + +/* + * Look up a code for a specified speed in a conversion table; + * used by drivers to map software speed values to hardware parameters. + */ +int +ttspeedtab(int speed, struct speedtab *table) +{ + + for ( ; table->sp_speed != -1; table++) + if (table->sp_speed == speed) + return (table->sp_code); + return (-1); +} + +/* + * Set input and output watermarks and buffer sizes. For input, the + * high watermark is about one second's worth of input above empty, the + * low watermark is slightly below high water, and the buffer size is a + * driver-dependent amount above high water. For output, the watermarks + * are near the ends of the buffer, with about 1 second's worth of input + * between them. All this only applies to the standard line discipline. + */ +void +ttsetwater(struct tty *tp) +{ + int cps, ttmaxhiwat, x; + + /* Input. */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + switch (tp->t_ispeedwat) { + case (speed_t)-1: + cps = tp->t_ispeed / 10; + break; + case 0: + /* + * This case is for old drivers that don't know about + * t_ispeedwat. Arrange for them to get the old buffer + * sizes and watermarks. + */ + cps = TTYHOG - 2 * 256; + tp->t_ififosize = 2 * 256; + break; + default: + cps = tp->t_ispeedwat / 10; + break; + } + tp->t_ihiwat = cps; + tp->t_ilowat = 7 * cps / 8; + x = cps + tp->t_ififosize; + clist_alloc_cblocks(&tp->t_rawq, x, x); + + /* Output. */ + switch (tp->t_ospeedwat) { + case (speed_t)-1: + cps = tp->t_ospeed / 10; + ttmaxhiwat = 2 * TTMAXHIWAT; + break; + case 0: + cps = tp->t_ospeed / 10; + ttmaxhiwat = TTMAXHIWAT; + break; + default: + cps = tp->t_ospeedwat / 10; + ttmaxhiwat = 8 * TTMAXHIWAT; + break; + } +#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? 
l : (x)) + tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); + x += cps; + x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ + tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ + x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ + x += OBUFSIZ + 100; + clist_alloc_cblocks(&tp->t_outq, x, x); +#undef CLAMP +} + +/* + * Report on state of foreground process group. + */ +void +ttyinfo(struct tty *tp) +{ + struct proc *p, *pick; + struct timeval utime, stime; + const char *stmp; + long ltmp; + int tmp; + struct thread *td; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); + + if (tp->t_session == NULL) + ttyprintf(tp, "not a controlling terminal\n"); + else if (tp->t_pgrp == NULL) + ttyprintf(tp, "no foreground process group\n"); + else { + PGRP_LOCK(tp->t_pgrp); + if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == 0) { + PGRP_UNLOCK(tp->t_pgrp); + ttyprintf(tp, "empty foreground process group\n"); + } else { + mtx_lock_spin(&sched_lock); + + /* Pick interesting process. */ + for (pick = NULL; p != 0; p = LIST_NEXT(p, p_pglist)) + if (proc_compare(pick, p)) + pick = p; + PGRP_UNLOCK(tp->t_pgrp); + + td = FIRST_THREAD_IN_PROC(pick); + stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */ + pick->p_stat == SMTX ? td->td_mtxname : + td->td_wmesg ? td->td_wmesg : "iowait"; + calcru(pick, &utime, &stime, NULL); + ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT || + pick->p_stat == SZOMB ? 0 : + pgtok(vmspace_resident_count(pick->p_vmspace)); + mtx_unlock_spin(&sched_lock); + + ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, + pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp); + + /* Print user time. */ + ttyprintf(tp, "%ld.%02ldu ", + utime.tv_sec, utime.tv_usec / 10000); + + /* Print system time. */ + ttyprintf(tp, "%ld.%02lds ", + (long)stime.tv_sec, stime.tv_usec / 10000); + + /* Print percentage cpu, resident set size. */ + ttyprintf(tp, "%d%% %ldk\n", tmp / 100, ltmp); + + } + } + tp->t_rocount = 0; /* so pending input will be retyped if BS */ +} + +/* + * Returns 1 if p2 is "better" than p1 + * + * The algorithm for picking the "interesting" process is thus: + * + * 1) Only foreground processes are eligible - implied. + * 2) Runnable processes are favored over anything else. The runner + * with the highest cpu utilization is picked (p_estcpu). Ties are + * broken by picking the highest pid. + * 3) The sleeper with the shortest sleep time is next. With ties, + * we pick out just "short-term" sleepers (P_SINTR == 0). + * 4) Further ties are broken by picking the highest pid. 
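+ *
+ * As an illustration of the helper macros below: TESTAB(a, b) packs
+ * the two test results into two bits, so BOTH means both tests held,
+ * ONLYA only the first (p1's), and ONLYB only the second (p2's).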
+ */ +#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define TESTAB(a, b) ((a)<<1 | (b)) +#define ONLYA 2 +#define ONLYB 1 +#define BOTH 3 + +static int +proc_compare(struct proc *p1, struct proc *p2) +{ + + int esta, estb; + struct ksegrp *kg; + mtx_assert(&sched_lock, MA_OWNED); + if (p1 == NULL) + return (1); + + /* + * see if at least one of them is runnable + */ + switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + case ONLYA: + return (0); + case ONLYB: + return (1); + case BOTH: + /* + * tie - favor one with highest recent cpu utilization + */ + esta = estb = 0; + FOREACH_KSEGRP_IN_PROC(p1,kg) { + esta += kg->kg_estcpu; + } + FOREACH_KSEGRP_IN_PROC(p2,kg) { + estb += kg->kg_estcpu; + } + if (estb > esta) + return (1); + if (esta > estb) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * weed out zombies + */ + switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + +#if 0 /* XXXKSE */ + /* + * pick the one with the smallest sleep time + */ + if (p2->p_slptime > p1->p_slptime) + return (0); + if (p1->p_slptime > p2->p_slptime) + return (1); + /* + * favor one sleeping in a non-interruptible sleep + */ + if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) + return (1); + if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) + return (0); +#endif + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ +} + +/* + * Output char to tty; console putchar style. + */ +int +tputchar(int c, struct tty *tp) +{ + int s; + + s = spltty(); + if (!ISSET(tp->t_state, TS_CONNECTED)) { + splx(s); + return (-1); + } + if (c == '\n') + (void)ttyoutput('\r', tp); + (void)ttyoutput(c, tp); + ttstart(tp); + splx(s); + return (0); +} + +/* + * Sleep on chan, returning ERESTART if tty changed while we napped and + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If + * the tty is revoked, restarting a pending call will redo validation done + * at the start of the call. + */ +int +ttysleep(struct tty *tp, void *chan, int pri, char *wmesg, int timo) +{ + int error; + int gen; + + gen = tp->t_gen; + error = tsleep(chan, pri, wmesg, timo); + if (error) + return (error); + return (tp->t_gen == gen ? 0 : ERESTART); +} + +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc(struct tty *tp) +{ + + if (tp) + return(tp); + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO); + ttyregister(tp); + return (tp); +} + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). 
+ */ +void +ttyfree(struct tty *tp) +{ + free(tp, M_TTYS); +} +#endif /* 0 */ + +void +ttyregister(struct tty *tp) +{ + tp->t_timeout = -1; + SLIST_INSERT_HEAD(&tty_list, tp, t_list); +} + +static int +sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) +{ + struct tty *tp; + struct xtty xt; + int error; + + SLIST_FOREACH(tp, &tty_list, t_list) { + bzero(&xt, sizeof xt); + xt.xt_size = sizeof xt; +#define XT_COPY(field) xt.xt_##field = tp->t_##field + xt.xt_rawcc = tp->t_rawq.c_cc; + xt.xt_cancc = tp->t_canq.c_cc; + xt.xt_outcc = tp->t_outq.c_cc; + XT_COPY(line); + if (tp->t_dev) + xt.xt_dev = dev2udev(tp->t_dev); + XT_COPY(state); + XT_COPY(flags); + XT_COPY(timeout); + if (tp->t_pgrp) + xt.xt_pgid = tp->t_pgrp->pg_id; + if (tp->t_session) + xt.xt_sid = tp->t_session->s_sid; + XT_COPY(termios); + XT_COPY(winsize); + XT_COPY(column); + XT_COPY(rocount); + XT_COPY(rocol); + XT_COPY(ififosize); + XT_COPY(ihiwat); + XT_COPY(ilowat); + XT_COPY(ispeedwat); + XT_COPY(ohiwat); + XT_COPY(olowat); + XT_COPY(ospeedwat); +#undef XT_COPY + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_ttys, "S,xtty", "All ttys"); +SYSCTL_LONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, + &tk_nin, 0, "Total TTY in characters"); +SYSCTL_LONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, + &tk_nout, 0, "Total TTY out characters"); + +void +nottystop(struct tty *tp, int rw) +{ + + return; +} + +int +ttyread(dev_t dev, struct uio *uio, int flag) +{ + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) + return (ENODEV); + return ((*linesw[tp->t_line].l_read)(tp, uio, flag)); +} + +int +ttywrite(dev_t dev, struct uio *uio, int flag) +{ + struct tty *tp; + + tp = dev->si_tty; + if (tp == NULL) + return (ENODEV); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c new file mode 100644 index 0000000..01628ff --- /dev/null +++ b/sys/kern/tty_compat.c @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +/* + * mapping routines for old line discipline (yuck) + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl_compat.h> +#include <sys/tty.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int ttcompatgetflags(struct tty *tp); +static void ttcompatsetflags(struct tty *tp, struct termios *t); +static void ttcompatsetlflags(struct tty *tp, struct termios *t); +static int ttcompatspeedtab(int speed, struct speedtab *table); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); + +static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, + { 38400, 15 }, + { 19200, 14 }, + { 9600, 13 }, + { 4800, 12 }, + { 2400, 11 }, + { 1800, 10 }, + { 1200, 9 }, + { 600, 8 }, + { 300, 7 }, + { 200, 6 }, + { 150, 5 }, + { 134, 4 }, + { 110, 3 }, + { 75, 2 }, + { 50, 1 }, + { 0, 0 }, + { -1, -1 }, +}; +static int compatspcodes[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, +}; + +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) + register struct tty *tp; + u_long *com; + caddr_t data; + struct termios *term; +{ + switch (*com) { + case TIOCSETP: + case TIOCSETN: { + register struct sgttyb *sg = (struct sgttyb *)data; + int speed; + + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; + else + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; + else + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
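+		    /* old TIOCSETP drained output and flushed input, hence
+		     * TIOCSETAF; TIOCSETN maps to plain TIOCSETA */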
TIOCSETAF : TIOCSETA; + break; + } + case TIOCSETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VINTR] = tc->t_intrc; + cc[VQUIT] = tc->t_quitc; + cc[VSTART] = tc->t_startc; + cc[VSTOP] = tc->t_stopc; + cc[VEOF] = tc->t_eofc; + cc[VEOL] = tc->t_brkc; + if (tc->t_brkc == -1) + cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; + break; + } + case TIOCSLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VSUSP] = ltc->t_suspc; + cc[VDSUSP] = ltc->t_dsuspc; + cc[VREPRINT] = ltc->t_rprntc; + cc[VDISCARD] = ltc->t_flushc; + cc[VWERASE] = ltc->t_werasc; + cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + u_long com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; + break; + } + case TIOCGLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc = tp->t_cc; + + ltc->t_suspc = cc[VSUSP]; + ltc->t_dsuspc = cc[VDSUSP]; + ltc->t_rprntc = cc[VREPRINT]; + ltc->t_flushc = cc[VDISCARD]; + ltc->t_werasc = cc[VWERASE]; + ltc->t_lnextc = cc[VLNEXT]; + break; + } + case TIOCLGET: + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; + if (ttydebug) + printf("CLGET: returning %x\n", *(int *)data); + break; + + case OTIOCGETD: + *(int *)data = tp->t_line ? tp->t_line : 2; + break; + + case OTIOCSETD: { + int ldisczero = 0; + + return (ttioctl(tp, TIOCSETD, + *(int *)data == 2 ? 
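+		    /* the old NTTYDISC (2) folds into the standard
+		     * termios discipline, slot 0 */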
(caddr_t)&ldisczero : data, flag)); + } + + case OTIOCCONS: + *(int *)data = 1; + return (ttioctl(tp, TIOCCONS, data, flag)); + + default: + return (ENOIOCTL); + } + return (0); +} + +static int +ttcompatgetflags(tp) + register struct tty *tp; +{ + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; + register int flags = 0; + + if (iflag&IXOFF) + flags |= TANDEM; + if (iflag&ICRNL || oflag&ONLCR) + flags |= CRMOD; + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { + if (iflag&INPCK) { + if (cflag&PARODD) + flags |= ODDP; + else + flags |= EVENP; + } else + flags |= EVENP | ODDP; + } + + if ((lflag&ICANON) == 0) { + /* fudge */ + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || (cflag&(CSIZE|PARENB)) != CS8) + flags |= CBREAK; + else + flags |= RAW; + } + if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8) + flags |= LITOUT; + if (cflag&MDMBUF) + flags |= MDMBUF; + if ((cflag&HUPCL) == 0) + flags |= NOHANG; + if (oflag&OXTABS) + flags |= XTABS; + if (lflag&ECHOE) + flags |= CRTERA|CRTBS; + if (lflag&ECHOKE) + flags |= CRTKIL|CRTBS; + if (lflag&ECHOPRT) + flags |= PRTERA; + if (lflag&ECHOCTL) + flags |= CTLECH; + if ((iflag&IXANY) == 0) + flags |= DECCTQ; + flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); + if (ttydebug) + printf("getflags: %x\n", flags); + return (flags); +} + +static void +ttcompatsetflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + if (flags & RAW) { + iflag = IGNBRK; + lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); + } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + iflag |= BRKINT|IXON|IMAXBEL; + lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ + if (flags & XTABS) + oflag |= OXTABS; + else + oflag &= ~OXTABS; + if (flags & CBREAK) + lflag &= ~ICANON; + else + lflag |= ICANON; + if (flags&CRMOD) { + iflag |= ICRNL; + oflag |= ONLCR; + } else { + iflag &= ~ICRNL; + oflag &= ~ONLCR; + } + } + if (flags&ECHO) + lflag |= ECHO; + else + lflag &= ~ECHO; + + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + /* XXX don't set INPCK if RAW or PASS8? 
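+	 * Note that EVENP|ODDP both set (what ttcompatgetflags() returns
+	 * when input parity checking is off) falls through to clearing INPCK.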
*/ + if ((flags&(EVENP|ODDP)) == EVENP) { + iflag |= INPCK; + cflag &= ~PARODD; + } else if ((flags&(EVENP|ODDP)) == ODDP) { + iflag |= INPCK; + cflag |= PARODD; + } else + iflag &= ~INPCK; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} + +static void +ttcompatsetlflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + if (flags&CRTERA) + lflag |= ECHOE; + else + lflag &= ~ECHOE; + if (flags&CRTKIL) + lflag |= ECHOKE; + else + lflag &= ~ECHOKE; + if (flags&PRTERA) + lflag |= ECHOPRT; + else + lflag &= ~ECHOPRT; + if (flags&CTLECH) + lflag |= ECHOCTL; + else + lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + if (flags & MDMBUF) + cflag |= MDMBUF; + else + cflag &= ~MDMBUF; + if (flags&NOHANG) + cflag &= ~HUPCL; + else + cflag |= HUPCL; + lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); + lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. + */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c new file mode 100644 index 0000000..0609dc9 --- /dev/null +++ b/sys/kern/tty_conf.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/conf.h> + +#ifndef MAXLDISC +#define MAXLDISC 9 +#endif + +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), +#endif + NODISC(3), /* loadable */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* NETGRAPHDISC */ + NODISC(7), /* loadable */ + NODISC(8), /* loadable */ +}; + +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); + +#define LOADABLE_LDISC 7 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. + */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; +{ + int slot = -1; + + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } + + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; + + return slot; +} + +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. + * + * discipline: Index for discipline to unload. 
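+ *
+ * A loadable discipline would typically pair the two calls; an
+ * illustrative sketch ("foo_disc" is hypothetical):
+ *
+ *	slot = ldisc_register(LDISC_LOAD, &foo_disc);
+ *	...
+ *	ldisc_deregister(slot);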
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} + +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} + +/* + * Do nothing specific version of line + * discipline specific ioctl command. + */ +int +l_nullioctl(tp, cmd, data, flags, td) + struct tty *tp; + u_long cmd; + char *data; + int flags; + struct thread *td; +{ + + return (ENOIOCTL); +} diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c new file mode 100644 index 0000000..91713c1 --- /dev/null +++ b/sys/kern/tty_cons.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/tty.h> +#include <sys/uio.h> +#include <sys/vnode.h> + +#include <ddb/ddb.h> + +#include <machine/cpu.h> + +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_poll_t cnpoll; +static d_kqfilter_t cnkqfilter; + +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = { + /* open */ cnopen, + /* close */ cnclose, + /* read */ cnread, + /* write */ cnwrite, + /* ioctl */ cnioctl, + /* poll */ cnpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "console", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ cnkqfilter, +}; + +struct cn_device { + STAILQ_ENTRY(cn_device) cnd_next; + char cnd_name[16]; + struct vnode *cnd_vp; + struct consdev *cnd_cn; +}; + +#define CNDEVPATHMAX 32 +#define CNDEVTAB_SIZE 4 +static struct cn_device cn_devtab[CNDEVTAB_SIZE]; +static STAILQ_HEAD(, cn_device) cn_devlist = + STAILQ_HEAD_INITIALIZER(cn_devlist); + +#define CND_INVALID(cnd, td) \ + (cnd == NULL || cnd->cnd_vp == NULL || \ + (cnd->cnd_vp->v_type == VBAD && !cn_devopen(cnd, td, 1))) + +static udev_t cn_udev_t; +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD, + &cn_udev_t, sizeof cn_udev_t, "T,dev_t", ""); + +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ +static int cn_mute; +static int openflag; /* how /dev/console was opened */ +static int cn_is_open; +static dev_t cn_devfsdev; /* represents the device private info */ +static u_char console_pausing; /* pause after each line during probe */ +static char *console_pausestr= +"<pause; press any key to proceed to next line or '.' to end pause mode>"; + +void cndebug(char *); + +CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +SET_DECLARE(cons_set, struct consdev); + +void +cninit(void) +{ + struct consdev *best_cn, *cn, **list; + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * Find the first console with the highest priority. + */ + best_cn = NULL; + SET_FOREACH(list, cons_set) { + cn = *list; + if (cn->cn_probe == NULL) + continue; + cn->cn_probe(cn); + if (cn->cn_pri == CN_DEAD) + continue; + if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri) + best_cn = cn; + if (boothowto & RB_MULTIPLE) { + /* + * Initialize console, and attach to it. + */ + cnadd(cn); + cn->cn_init(cn); + } + } + if (best_cn == NULL) + return; + if ((boothowto & RB_MULTIPLE) == 0) { + cnadd(best_cn); + best_cn->cn_init(best_cn); + } + if (boothowto & RB_PAUSE) + console_pausing = 1; + /* + * Make the best console the preferred console. 
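+	 * (cnselect() moves it to the head of cn_devlist, so it is the
+	 * device that cnread(), cnwrite() and cnioctl() reach first.)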
+ */ + cnselect(best_cn); +} + +void +cninit_finish() +{ + console_pausing = 0; +} + +/* add a new physical console to back the virtual console */ +int +cnadd(struct consdev *cn) +{ + struct cn_device *cnd; + int i; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + if (cnd->cnd_cn == cn) + return (0); + for (i = 0; i < CNDEVTAB_SIZE; i++) { + cnd = &cn_devtab[i]; + if (cnd->cnd_cn == NULL) + break; + } + if (cnd->cnd_cn != NULL) + return (ENOMEM); + cnd->cnd_cn = cn; + STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next); + return (0); +} + +void +cnremove(struct consdev *cn) +{ + struct cn_device *cnd; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if (cnd->cnd_cn != cn) + continue; + STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); + if (cnd->cnd_vp != NULL) + vn_close(cnd->cnd_vp, openflag, NOCRED, NULL); + cnd->cnd_vp = NULL; + cnd->cnd_cn = NULL; + cnd->cnd_name[0] = '\0'; +#if 0 + /* + * XXX + * syscons gets really confused if console resources are + * freed after the system has initialized. + */ + if (cn->cn_term != NULL) + cn->cn_term(cn); +#endif + return; + } +} + +void +cnselect(struct consdev *cn) +{ + struct cn_device *cnd; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if (cnd->cnd_cn != cn) + continue; + if (cnd == STAILQ_FIRST(&cn_devlist)) + return; + STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); + STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next); + return; + } +} + +void +cndebug(char *str) +{ + int i, len; + + len = strlen(str); + cnputc('>'); cnputc('>'); cnputc('>'); cnputc(' '); + for (i = 0; i < len; i++) + cnputc(str[i]); + cnputc('\n'); +} + +static int +sysctl_kern_console(SYSCTL_HANDLER_ARGS) +{ + struct cn_device *cnd; + struct consdev *cp, **list; + char *name, *p; + int delete, len, error; + + len = 2; + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev != NULL) + len += strlen(devtoname(cp->cn_dev)) + 1; + } + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + len += strlen(devtoname(cnd->cnd_cn->cn_dev)) + 1; + len = len > CNDEVPATHMAX ? len : CNDEVPATHMAX; + MALLOC(name, char *, len, M_TEMP, M_WAITOK | M_ZERO); + p = name; + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + p += sprintf(p, "%s,", devtoname(cnd->cnd_cn->cn_dev)); + *p++ = '/'; + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev != NULL) + p += sprintf(p, "%s,", devtoname(cp->cn_dev)); + } + error = sysctl_handle_string(oidp, name, len, req); + if (error == 0 && req->newptr != NULL) { + p = name; + error = ENXIO; + delete = 0; + if (*p == '-') { + delete = 1; + p++; + } + SET_FOREACH(list, cons_set) { + cp = *list; + if (cp->cn_dev == NULL || + strcmp(p, devtoname(cp->cn_dev)) != 0) + continue; + if (delete) { + cnremove(cp); + error = 0; + } else { + error = cnadd(cp); + if (error == 0) + cnselect(cp); + } + break; + } + } + FREE(name, M_TEMP); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING|CTLFLAG_RW, + 0, 0, sysctl_kern_console, "A", "Console device control"); + +/* + * User has changed the state of the console muting. + * This may require us to open or close the device in question. 
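+ * (Typically driven from userland with "sysctl kern.consmute=1".)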
+ */ +static int +sysctl_kern_consmute(SYSCTL_HANDLER_ARGS) +{ + int error; + int ocn_mute; + + ocn_mute = cn_mute; + error = sysctl_handle_int(oidp, &cn_mute, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (ocn_mute && !cn_mute && cn_is_open) + error = cnopen(NODEV, openflag, 0, curthread); + else if (!ocn_mute && cn_mute && cn_is_open) { + error = cnclose(NODEV, openflag, 0, curthread); + cn_is_open = 1; /* XXX hack */ + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof(cn_mute), sysctl_kern_consmute, "I", ""); + +static int +cn_devopen(struct cn_device *cnd, struct thread *td, int forceopen) +{ + char path[CNDEVPATHMAX]; + struct nameidata nd; + struct vnode *vp; + dev_t dev; + int error; + + if ((vp = cnd->cnd_vp) != NULL) { + if (!forceopen && vp->v_type != VBAD) { + dev = vp->v_rdev; + return ((*devsw(dev)->d_open)(dev, openflag, 0, td)); + } + cnd->cnd_vp = NULL; + vn_close(vp, openflag, td->td_ucred, td); + } + if (cnd->cnd_name[0] == '\0') + strncpy(cnd->cnd_name, devtoname(cnd->cnd_cn->cn_dev), + sizeof(cnd->cnd_name)); + snprintf(path, sizeof(path), "/dev/%s", cnd->cnd_name); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td); + error = vn_open(&nd, &openflag, 0); + if (error == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_UNLOCK(nd.ni_vp, 0, td); + if (nd.ni_vp->v_type == VCHR) + cnd->cnd_vp = nd.ni_vp; + else + vn_close(nd.ni_vp, openflag, td->td_ucred, td); + } + return (cnd->cnd_vp != NULL); +} + +static int +cnopen(dev_t dev, int flag, int mode, struct thread *td) +{ + struct cn_device *cnd; + + openflag = flag | FWRITE; /* XXX */ + cn_is_open = 1; /* console is logically open */ + if (cn_mute) + return (0); + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) + cn_devopen(cnd, td, 0); + return (0); +} + +static int +cnclose(dev_t dev, int flag, int mode, struct thread *td) +{ + struct cn_device *cnd; + struct vnode *vp; + + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + if ((vp = cnd->cnd_vp) == NULL) + continue; + cnd->cnd_vp = NULL; + vn_close(vp, openflag, td->td_ucred, td); + } + cn_is_open = 0; + return (0); +} + +static int +cnread(dev_t dev, struct uio *uio, int flag) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + return (0); + dev = cnd->cnd_vp->v_rdev; + return ((*devsw(dev)->d_read)(dev, uio, flag)); +} + +static int +cnwrite(dev_t dev, struct uio *uio, int flag) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + goto done; + if (constty) + dev = constty->t_dev; + else + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) { + log_console(uio); + return ((*devsw(dev)->d_write)(dev, uio, flag)); + } +done: + uio->uio_resid = 0; /* dump the data */ + return (0); +} + +static int +cnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct cn_device *cnd; + int error; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, td)) + return (0); + /* + * Superuser can always use this to wrest control of console + * output from the "virtual" console. 
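+	 * (A privileged TIOCCONS here simply clears constty, undoing an
+	 * earlier redirection of console output to another tty.)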
+ */ + if (cmd == TIOCCONS && constty) { + error = suser(td); + if (error) + return (error); + constty = NULL; + return (0); + } + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_ioctl)(dev, cmd, data, flag, td)); + return (0); +} + +/* + * XXX + * poll/kqfilter do not appear to be correct + */ +static int +cnpoll(dev_t dev, int events, struct thread *td) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, td)) + return (0); + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_poll)(dev, events, td)); + return (0); +} + +static int +cnkqfilter(dev_t dev, struct knote *kn) +{ + struct cn_device *cnd; + + cnd = STAILQ_FIRST(&cn_devlist); + if (cn_mute || CND_INVALID(cnd, curthread)) + return (1); + dev = cnd->cnd_vp->v_rdev; + if (dev != NULL) + return ((*devsw(dev)->d_kqfilter)(dev, kn)); + return (1); +} + +/* + * Low level console routines. + */ +int +cngetc(void) +{ + int c; + + if (cn_mute) + return (-1); + while ((c = cncheckc()) == -1) + ; + if (c == '\r') + c = '\n'; /* console input is always ICRNL */ + return (c); +} + +int +cncheckc(void) +{ + struct cn_device *cnd; + struct consdev *cn; + int c; + + if (cn_mute) + return (-1); + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + c = cn->cn_checkc(cn->cn_dev); + if (c != -1) { + return (c); + } + } + return (-1); +} + +void +cnputc(int c) +{ + struct cn_device *cnd; + struct consdev *cn; + char *cp; + + if (cn_mute || c == '\0') + return; + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + if (c == '\n') + cn->cn_putc(cn->cn_dev, '\r'); + cn->cn_putc(cn->cn_dev, c); + } +#ifdef DDB + if (console_pausing && !db_active && (c == '\n')) { +#else + if (console_pausing && (c == '\n')) { +#endif + for (cp = console_pausestr; *cp != '\0'; cp++) + cnputc(*cp); + if (cngetc() == '.') + console_pausing = 0; + cnputc('\r'); + for (cp = console_pausestr; *cp != '\0'; cp++) + cnputc(' '); + cnputc('\r'); + } +} + +void +cndbctl(int on) +{ + struct cn_device *cnd; + struct consdev *cn; + static int refcount; + + if (!on) + refcount--; + if (refcount == 0) + STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { + cn = cnd->cnd_cn; + if (cn->cn_dbctl != NULL) + cn->cn_dbctl(cn->cn_dev, on); + } + if (on) + refcount++; +} + +static void +cn_drvinit(void *unused) +{ + + cn_devfsdev = make_dev(&cn_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "console"); +} + +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c new file mode 100644 index 0000000..7d6e736 --- /dev/null +++ b/sys/kern/tty_pty.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $FreeBSD$ + */ + +/* + * Pseudo-teletype Driver + * (Actually two drivers, requiring two entries in 'cdevsw') + */ +#include "opt_compat.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/malloc.h> + +static MALLOC_DEFINE(M_PTY, "ptys", "pty data structures"); + +static void ptsstart(struct tty *tp); +static void ptsstop(struct tty *tp, int rw); +static void ptcwakeup(struct tty *tp, int flag); +static dev_t ptyinit(dev_t cdev); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_poll_t ptcpoll; + +#define CDEV_MAJOR_S 5 +static struct cdevsw pts_cdevsw = { + /* open */ ptsopen, + /* close */ ptsclose, + /* read */ ptsread, + /* write */ ptswrite, + /* ioctl */ ptyioctl, + /* poll */ ttypoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "pts", + /* maj */ CDEV_MAJOR_S, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ ttykqfilter, +}; + +#define CDEV_MAJOR_C 6 +static struct cdevsw ptc_cdevsw = { + /* open */ ptcopen, + /* close */ ptcclose, + /* read */ ptcread, + /* write */ ptcwrite, + /* ioctl */ ptyioctl, + /* poll */ ptcpoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ptc", + /* maj */ CDEV_MAJOR_C, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY | D_KQFILTER, + /* kqfilter */ ttykqfilter, +}; + +#define BUFSIZ 100 /* Chunk size iomoved to/from user */ + +struct pt_ioctl { + int pt_flags; + struct selinfo pt_selr, pt_selw; + u_char pt_send; + u_char pt_ucntl; + struct tty pt_tty; + dev_t devs, devc; + struct prison *pt_prison; +}; + +#define PF_PKT 0x08 /* packet mode */ +#define PF_STOPPED 0x10 /* user told stopped */ +#define PF_REMOTE 0x20 /* remote and flow controlled input */ +#define PF_NOSTOP 0x40 +#define PF_UCNTL 0x80 /* user control mode */ + +static char *names = "pqrsPQRS"; +/* + * This function creates and initializes a pts/ptc pair + * + * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == 
/dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * + * XXX: define and add mapping of upper minor bits to allow more + * than 256 ptys. + */ +static dev_t +ptyinit(dev_t devc) +{ + dev_t devs; + struct pt_ioctl *pt; + int n; + + n = minor(devc); + /* For now we only map the lower 8 bits of the minor */ + if (n & ~0xff) + return (NODEV); + + devc->si_flags &= ~SI_CHEAPCLONE; + + pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO); + pt->devs = devs = make_dev(&pts_cdevsw, n, + UID_ROOT, GID_WHEEL, 0666, "tty%c%r", names[n / 32], n % 32); + pt->devc = devc; + + devs->si_drv1 = devc->si_drv1 = pt; + devs->si_tty = devc->si_tty = &pt->pt_tty; + pt->pt_tty.t_dev = devs; + ttyregister(&pt->pt_tty); + return (devc); +} + +/*ARGSUSED*/ +static int +ptsopen(dev, flag, devtype, td) + dev_t dev; + int flag, devtype; + struct thread *td; +{ + register struct tty *tp; + int error; + struct pt_ioctl *pti; + + if (!dev->si_drv1) + return(ENXIO); + pti = dev->si_drv1; + tp = dev->si_tty; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + } else if (tp->t_state & TS_XCLUDE && suser(td)) { + return (EBUSY); + } else if (pti->pt_prison != td->td_ucred->cr_prison) { + return (EBUSY); + } + if (tp->t_oproc) /* Ctrlr still around. */ + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + while ((tp->t_state & TS_CARR_ON) == 0) { + if (flag&FNONBLOCK) + break; + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +static int +ptsclose(dev, flag, mode, td) + dev_t dev; + int flag, mode; + struct thread *td; +{ + register struct tty *tp; + int err; + + tp = dev->si_tty; + err = (*linesw[tp->t_line].l_close)(tp, flag); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); + return (err); +} + +static int +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + register struct tty *tp = dev->si_tty; + register struct pt_ioctl *pti = dev->si_drv1; + struct pgrp *pg; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + sx_slock(&proctree_lock); + PROC_LOCK(p); + if (SIGISMEMBER(p->p_sigignore, SIGTTIN) || + SIGISMEMBER(p->p_sigmask, SIGTTIN) || + p->p_pgrp->pg_jobc == 0 || p->p_flag & P_PPWAIT) { + PROC_UNLOCK(p); + sx_sunlock(&proctree_lock); + return (EIO); + } + pg = p->p_pgrp; + PROC_UNLOCK(p); + PGRP_LOCK(pg); + sx_sunlock(&proctree_lock); + pgsignal(pg, SIGTTIN, 1); + PGRP_UNLOCK(pg); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. 
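+ * Data goes through the line discipline onto t_outq, where ptcread()
+ * on the master side picks it up.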
+ * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. + */ +static int +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = dev->si_tty; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. + */ +static void +ptsstart(tp) + struct tty *tp; +{ + register struct pt_ioctl *pti = tp->t_dev->si_drv1; + + if (tp->t_state & TS_TTSTOP) + return; + if (pti->pt_flags & PF_STOPPED) { + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send = TIOCPKT_START; + } + ptcwakeup(tp, FREAD); +} + +static void +ptcwakeup(tp, flag) + struct tty *tp; + int flag; +{ + struct pt_ioctl *pti = tp->t_dev->si_drv1; + + if (flag & FREAD) { + selwakeup(&pti->pt_selr); + wakeup(TSA_PTC_READ(tp)); + } + if (flag & FWRITE) { + selwakeup(&pti->pt_selw); + wakeup(TSA_PTC_WRITE(tp)); + } +} + +static int +ptcopen(dev, flag, devtype, td) + dev_t dev; + int flag, devtype; + struct thread *td; +{ + register struct tty *tp; + struct pt_ioctl *pti; + + if (!dev->si_drv1) + ptyinit(dev); + if (!dev->si_drv1) + return(ENXIO); + tp = dev->si_tty; + if (tp->t_oproc) + return (EIO); + tp->t_timeout = -1; + tp->t_oproc = ptsstart; + tp->t_stop = ptsstop; + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + tp->t_lflag &= ~EXTPROC; + pti = dev->si_drv1; + pti->pt_prison = td->td_ucred->cr_prison; + pti->pt_flags = 0; + pti->pt_send = 0; + pti->pt_ucntl = 0; + return (0); +} + +static int +ptcclose(dev, flags, fmt, td) + dev_t dev; + int flags; + int fmt; + struct thread *td; +{ + register struct tty *tp; + + tp = dev->si_tty; + (void)(*linesw[tp->t_line].l_modem)(tp, 0); + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. + */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + + tp->t_oproc = 0; /* mark closed */ + return (0); +} + +static int +ptcread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp = dev->si_tty; + struct pt_ioctl *pti = dev->si_drv1; + char buf[BUFSIZ]; + int error = 0, cc; + + /* + * We want to block until the slave + * is open, and there's something to read; + * but if we lost the slave or we're NBIO, + * then return the appropriate error instead. 
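+	 * In packet mode (TIOCPKT) or user-control mode (TIOCUCNTL), each
+	 * read is prefixed with a status byte: zero for plain data,
+	 * otherwise the pending pt_send or pt_ucntl value.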
+ */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove((caddr_t)&tp->t_termios, cc, + uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state & TS_CONNECTED) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + ttwwakeup(tp); + return (error); +} + +static void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = tp->t_dev->si_drv1; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +static int +ptcpoll(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + register struct tty *tp = dev->si_tty; + struct pt_ioctl *pti = dev->si_drv1; + int revents = 0; + int s; + + if ((tp->t_state & TS_CONNECTED) == 0) + return (seltrue(dev, events, td) | POLLHUP); + + /* + * Need to block timeouts (ttrstart). + */ + s = spltty(); + + if (events & (POLLIN | POLLRDNORM)) + if ((tp->t_state & TS_ISOPEN) && + ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) || + ((pti->pt_flags & PF_PKT) && pti->pt_send) || + ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (tp->t_state & TS_ISOPEN && + ((pti->pt_flags & PF_REMOTE) ? 
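+		    /* remote mode: writable only while the canq is empty */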
+ (tp->t_canq.c_cc == 0) : + ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) || + (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON))))) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & POLLHUP) + if ((tp->t_state & TS_CARR_ON) == 0) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) + selrecord(td, &pti->pt_selr); + + if (events & (POLLOUT | POLLWRNORM)) + selrecord(td, &pti->pt_selw); + } + splx(s); + + return (revents); +} + +static int +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = dev->si_tty; + register u_char *cp = 0; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = dev->si_drv1; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; + return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. + */ + if (cc > 0) + break; + } + } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup(TSA_PTS_READ(tp)); + return (0); + } + while (uio->uio_resid > 0 || cc > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || !(tp->t_lflag&ICANON))) { + wakeup(TSA_HUP_OR_INPUT(tp)); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq, or an empty canq. + */ + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +/*ARGSUSED*/ +static int +ptyioctl(dev, cmd, data, flag, td) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct thread *td; +{ + register struct tty *tp = dev->si_tty; + register struct pt_ioctl *pti = dev->si_drv1; + register u_char *cc = tp->t_cc; + int stop, error; + + if (devsw(dev)->d_open == ptcopen) { + switch (cmd) { + + case TIOCGPGRP: + /* + * We avoid calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. 
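+			 * (ttioctl() only answers TIOCGPGRP on the caller's
+			 * controlling terminal, which the master is not, so
+			 * answer directly here.)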
+ */ + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + } + + /* + * The rest of the ioctls shouldn't be called until + * the slave is open. + */ + if ((tp->t_state & TS_ISOPEN) == 0) + return (EAGAIN); + + switch (cmd) { +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in + * the outq. + */ + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG || + *(unsigned int *)data == 0) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + if (tp->t_pgrp != NULL) { + PGRP_LOCK(tp->t_pgrp); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + PGRP_UNLOCK(tp->t_pgrp); + } + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + } + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_lflag & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, td); + if (error == ENOIOCTL) + error = ttioctl(tp, cmd, data, flag); + if (error == ENOIOCTL) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
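+	 * (The TIOCPKT_IOCTL status byte tells the master to re-read the
+	 * termios state; see the matching code in ptcread().)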
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} + + +static void ptc_drvinit(void *unused); + +static void pty_clone(void *arg, char *name, int namelen, dev_t *dev); + +static void +pty_clone(arg, name, namelen, dev) + void *arg; + char *name; + int namelen; + dev_t *dev; +{ + int u; + + if (*dev != NODEV) + return; + if (bcmp(name, "pty", 3) != 0) + return; + if (name[5] != '\0') + return; + switch (name[3]) { + case 'p': u = 0; break; + case 'q': u = 32; break; + case 'r': u = 64; break; + case 's': u = 96; break; + case 'P': u = 128; break; + case 'Q': u = 160; break; + case 'R': u = 192; break; + case 'S': u = 224; break; + default: return; + } + if (name[4] >= '0' && name[4] <= '9') + u += name[4] - '0'; + else if (name[4] >= 'a' && name[4] <= 'v') + u += name[4] - 'a' + 10; + else + return; + *dev = make_dev(&ptc_cdevsw, u, + UID_ROOT, GID_WHEEL, 0666, "pty%c%r", names[u / 32], u % 32); + (*dev)->si_flags |= SI_CHEAPCLONE; + return; +} + +static void +ptc_drvinit(unused) + void *unused; +{ + EVENTHANDLER_REGISTER(dev_clone, pty_clone, 0, 1000); + cdevsw_add(&pts_cdevsw); + cdevsw_add(&ptc_cdevsw); +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c new file mode 100644 index 0000000..78bb231 --- /dev/null +++ b/sys/kern/tty_subr.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init(void *); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc(void); +static void cblock_alloc_cblocks(int number); +static void cblock_free(struct cblock *cblockp); +static void cblock_free_cblocks(int number); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + int cbsize = CBSIZE; + + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * cbsize, ctotcount * cbsize - cfreecount, cfreecount, + cfreecount - cslushcount * cbsize, cslushcount * cbsize); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"cblock_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). 
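+ *
+ * As a hedged illustration (not part of this change), a caller such as a
+ * tty open routine would typically reserve space for each queue here and
+ * release it again on final close with clist_free_cblocks(); the sizes
+ * below are assumptions chosen only for the example:
+ *
+ *	s = spltty();
+ *	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ *	clist_alloc_cblocks(&tp->t_outq, TTYMAXHIWAT + 200, TTYMAXHIWAT + 200);
+ *	splx(s);
+ *
+ * See ttyopen() in tty.c for the reservations actually used there.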
+ */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. 
+ */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. 
+ */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a separate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. 
+ */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 0000000..e1e03bd --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +/* + * Indirect driver for controlling tty. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/proc.h> +#include <sys/ttycom.h> +#include <sys/vnode.h> + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_poll_t cttypoll; + +#define CDEV_MAJOR 1 + +static struct cdevsw ctty_cdevsw = { + /* open */ cttyopen, + /* close */ nullclose, + /* read */ cttyread, + /* write */ cttywrite, + /* ioctl */ cttyioctl, + /* poll */ cttypoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ctty", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ D_TTY, +}; + +#define cttyvp(td) ((td)->td_proc->p_flag & P_CONTROLT ? 
(td)->td_proc->p_session->s_ttyvp : NULL) + +/*ARGSUSED*/ +static int +cttyopen(dev, flag, mode, td) + dev_t dev; + int flag, mode; + struct thread *td; +{ + struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (ENXIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_OPEN(ttyvp, flag, NOCRED, td); + VOP_UNLOCK(ttyvp, 0, td); + return (error); +} + +/*ARGSUSED*/ +static int +cttyread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = uio->uio_td; + register struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_READ(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, td); + return (error); +} + +/*ARGSUSED*/ +static int +cttywrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct thread *td = uio->uio_td; + struct vnode *ttyvp; + struct mount *mp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + mp = NULL; + if (ttyvp->v_type != VCHR && + (error = vn_start_write(ttyvp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_WRITE(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, td); + vn_finished_write(mp); + return (error); +} + +/*ARGSUSED*/ +static int +cttyioctl(dev, cmd, addr, flag, td) + dev_t dev; + u_long cmd; + caddr_t addr; + int flag; + struct thread *td; +{ + struct vnode *ttyvp; + int error; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ + if (cmd == TIOCNOTTY) { + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + error = 0; + if (!SESS_LEADER(td->td_proc)) + td->td_proc->p_flag &= ~P_CONTROLT; + else + error = EINVAL; + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + return (error); + } + return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, td)); +} + +/*ARGSUSED*/ +static int +cttypoll(dev, events, td) + dev_t dev; + int events; + struct thread *td; +{ + struct vnode *ttyvp; + + PROC_LOCK(td->td_proc); + SESS_LOCK(td->td_proc->p_session); + ttyvp = cttyvp(td); + SESS_UNLOCK(td->td_proc->p_session); + PROC_UNLOCK(td->td_proc); + + if (ttyvp == NULL) + /* try operation to get EOF/failure */ + return (seltrue(dev, events, td)); + return (VOP_POLL(ttyvp, events, td->td_ucred, td)); +} + +static void ctty_clone(void *arg, char *name, int namelen, dev_t *dev); + +static dev_t ctty; + +static void +ctty_clone(void *arg, char *name, int namelen, dev_t *dev) +{ + struct vnode *vp; + + if (*dev != NODEV) + return; + if (strcmp(name, "tty")) + return; + vp = cttyvp(curthread); + if (vp == NULL) { + if (ctty) + *dev = ctty; + } else + *dev = vp->v_rdev; +} + + +static void ctty_drvinit(void *unused); +static void +ctty_drvinit(unused) + void *unused; +{ + + if (devfs_present) { + EVENTHANDLER_REGISTER(dev_clone, 
ctty_clone, 0, 1000); + ctty = make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "ctty"); + } else { + make_dev(&ctty_cdevsw, 0, 0, 0, 0666, "tty"); + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c new file mode 100644 index 0000000..b31026a --- /dev/null +++ b/sys/kern/uipc_accf.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2000 Paycounter, Inc. + * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> + +static SLIST_HEAD(, accept_filter) accept_filtlsthd = + SLIST_HEAD_INITIALIZER(&accept_filtlsthd); + +MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); + +static int unloadable = 0; + +SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */ +SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); +SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, + "Allow unload of accept filters (not recommended)"); + +/* + * must be passed a malloc'd structure so we don't explode if the kld + * is unloaded, we leak the struct on deallocation to deal with this, + * but if a filter is loaded with the same name as a leaked one we re-use + * the entry. 
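+ *
+ * For illustration only (not part of this change), a filter module is
+ * expected to hand its structure to accept_filt_generic_mod_event() via
+ * the module system; the filter name and callback below are hypothetical:
+ *
+ *	static struct accept_filter accf_example_filter = {
+ *		"exampleready",			/- accf_name -/
+ *		accf_example_callback,		/- accf_callback -/
+ *		NULL,				/- accf_create -/
+ *		NULL				/- accf_destroy -/
+ *	};
+ *	static moduledata_t accf_example_mod = {
+ *		"accf_example",
+ *		accept_filt_generic_mod_event,
+ *		&accf_example_filter
+ *	};
+ *	DECLARE_MODULE(accf_example, accf_example_mod, SI_SUB_DRIVERS,
+ *	    SI_ORDER_MIDDLE);
+ *
+ * (The field notes above use /- -/ in place of nested comment markers.)
+ * See accf_data(9) and accf_http(9) for the real in-tree filters.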
+ */ +int +accept_filt_add(struct accept_filter *filt) +{ + struct accept_filter *p; + + SLIST_FOREACH(p, &accept_filtlsthd, accf_next) + if (strcmp(p->accf_name, filt->accf_name) == 0) { + if (p->accf_callback != NULL) { + return (EEXIST); + } else { + p->accf_callback = filt->accf_callback; + FREE(filt, M_ACCF); + return (0); + } + } + + if (p == NULL) + SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); + return (0); +} + +int +accept_filt_del(char *name) +{ + struct accept_filter *p; + + p = accept_filt_get(name); + if (p == NULL) + return (ENOENT); + + p->accf_callback = NULL; + return (0); +} + +struct accept_filter * +accept_filt_get(char *name) +{ + struct accept_filter *p; + + SLIST_FOREACH(p, &accept_filtlsthd, accf_next) + if (strcmp(p->accf_name, name) == 0) + return (p); + + return (NULL); +} + +int +accept_filt_generic_mod_event(module_t mod, int event, void *data) +{ + struct accept_filter *p; + struct accept_filter *accfp = (struct accept_filter *) data; + int s, error; + + switch (event) { + case MOD_LOAD: + MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, M_WAITOK); + bcopy(accfp, p, sizeof(*p)); + s = splnet(); + error = accept_filt_add(p); + splx(s); + break; + + case MOD_UNLOAD: + /* + * Do not support unloading yet. we don't keep track of refcounts + * and unloading an accept filter callback and then having it called + * is a bad thing. A simple fix would be to track the refcount + * in the struct accept_filter. + */ + if (unloadable != 0) { + s = splnet(); + error = accept_filt_del(accfp->accf_name); + splx(s); + } else + error = EOPNOTSUPP; + break; + + case MOD_SHUTDOWN: + error = 0; + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} diff --git a/sys/kern/uipc_cow.c b/sys/kern/uipc_cow.c new file mode 100644 index 0000000..239e7c5 --- /dev/null +++ b/sys/kern/uipc_cow.c @@ -0,0 +1,181 @@ +/*- + * Copyright (c) 1997, Duke University + * All rights reserved. + * + * Author: + * Andrew Gallatin <gallatin@cs.duke.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by Duke University + * 4. The name of Duke University may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This is a set of routines for enabling and disabling copy on write + * protection for data written into sockets. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/mbuf.h> +#include <sys/socketvar.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#if 0 +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <vm/swap_pager.h> +#endif + + +struct netsend_cow_stats { + int attempted; + int fail_not_mapped; + int fail_wired; + int fail_not_anon; + int fail_pmap_cow; + int fail_pg_error; + int fail_kva; + int free_post_exit; + int success; + int iodone; + int freed; +}; + +static struct netsend_cow_stats socow_stats = {0,0,0,0,0,0,0,0,0,0,0}; + +extern struct sf_buf *sf_bufs; +extern vm_offset_t sf_base; +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) +void sf_buf_free(caddr_t addr, void *args); +struct sf_buf *sf_buf_alloc(void); +static void socow_iodone(caddr_t addr, void *args); + +static void +socow_iodone(caddr_t addr, void *args) +{ + int s; + struct sf_buf *sf; + + vm_offset_t paddr; + vm_page_t pp; + + sf = dtosf(addr); + paddr = vtophys((vm_offset_t)addr); + pp = PHYS_TO_VM_PAGE(paddr); + s = splvm(); + /* remove COW mapping */ + vm_page_cowclear(pp); + vm_object_deallocate(pp->object); + splx(s); + /* note that sf_buf_free() unwires the page for us*/ + sf_buf_free(addr, NULL); + socow_stats.iodone++; +} + +int +socow_setup(struct mbuf *m0, struct uio *uio) +{ + struct sf_buf *sf; + vm_page_t pp; + vm_offset_t pa; + struct iovec *iov; + struct vmspace *vmspace; + struct vm_map *map; + vm_offset_t uva; + int s; + + vmspace = curproc->p_vmspace;; + map = &vmspace->vm_map; + uva = (vm_offset_t) uio->uio_iov->iov_base; + + s = splvm(); + + /* + * verify page is mapped & not already wired for i/o + */ + socow_stats.attempted++; + pa=pmap_extract(map->pmap, uva); + if(!pa) { + socow_stats.fail_not_mapped++; + splx(s); + return(0); + } + pp = PHYS_TO_VM_PAGE(pa); + + sf = sf_buf_alloc(); + sf->m = pp; + pmap_qenter(sf->kva, &pp, 1); + + /* + * set up COW + */ + vm_page_cowsetup(pp); + + /* + * wire the page for I/O + */ + vm_page_wire(pp); + + /* + * prevent the process from exiting on us. 
+ */ + vm_object_reference(pp->object); + + /* + * attach to mbuf + */ + m0->m_data = (caddr_t)sf->kva; + m0->m_len = PAGE_SIZE; + MEXTADD(m0, sf->kva, PAGE_SIZE, socow_iodone, NULL, 0, EXT_SFBUF); + socow_stats.success++; + + iov = uio->uio_iov; + iov->iov_base += PAGE_SIZE; + iov->iov_len -= PAGE_SIZE; + uio->uio_resid -= PAGE_SIZE; + uio->uio_offset += PAGE_SIZE; + if (iov->iov_len == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + } + + splx(s); + return(1); +} diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c new file mode 100644 index 0000000..b8321eb --- /dev/null +++ b/sys/kern/uipc_domain.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socketvar.h> +#include <sys/systm.h> +#include <vm/uma.h> + +/* + * System initialization + * + * Note: domain initialization takes place on a per domain basis + * as a result of traversing a SYSINIT linker set. Most likely, + * each domain would want to call DOMAIN_SET(9) itself, which + * would cause the domain to be added just after domaininit() + * is called during startup. + * + * See DOMAIN_SET(9) for details on its use. 
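+ *
+ * Purely as an illustration (not part of this change), a protocol family
+ * would typically declare itself along these lines; AF_EXAMPLE, examplesw
+ * and exampledomain are hypothetical names:
+ *
+ *	static struct protosw examplesw[] = { ... };
+ *
+ *	struct domain exampledomain = {
+ *		AF_EXAMPLE, "example", NULL, NULL, NULL,
+ *		examplesw,
+ *		&examplesw[sizeof(examplesw) / sizeof(examplesw[0])]
+ *	};
+ *	DOMAIN_SET(example);
+ *
+ * DOMAIN_SET(9) arranges for net_add_domain() below to be run for the
+ * domain during startup (or at module load).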
+ */ + +static void domaininit(void *); +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) + +static struct callout pffast_callout; +static struct callout pfslow_callout; + +static void pffasttimo(void *); +static void pfslowtimo(void *); + +struct domain *domains; + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +static void +net_init_domain(struct domain *dp) +{ + register struct protosw *pr; + int s; + + s = splnet(); + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ + if (pr->pr_usrreqs == 0) + panic("domaininit: %ssw[%d] has no usrreqs!", + dp->dom_name, + (int)(pr - dp->dom_protosw)); + if (pr->pr_init) + (*pr->pr_init)(); + } + /* + * update global informatio about maximums + */ + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + splx(s); +} + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +void +net_add_domain(void *data) +{ + int s; + struct domain *dp; + + dp = (struct domain *)data; + s = splnet(); + dp->dom_next = domains; + domains = dp; + splx(s); + net_init_domain(dp); +} + +/* ARGSUSED*/ +static void +domaininit(void *dummy) +{ + /* + * Before we do any setup, make sure to initialize the + * zone allocator we get struct sockets from. + */ + + socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(socket_zone, maxsockets); + + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; + + callout_init(&pffast_callout, 0); + callout_init(&pfslow_callout, 0); + + callout_reset(&pffast_callout, 1, pffasttimo, NULL); + callout_reset(&pfslow_callout, 1, pfslowtimo, NULL); +} + + +struct protosw * +pffindtype(family, type) + int family; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} + +struct protosw * +pffindproto(family, protocol, type) + int family; + int protocol; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { + if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) + return (pr); + + if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && + pr->pr_protocol == 0 && maybe == (struct protosw *)0) + maybe = pr; + } + return (maybe); +} + +void +pfctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, (void *)0); +} + +void +pfctlinput2(cmd, sa, ctlparam) + int cmd; + struct sockaddr *sa; + void *ctlparam; +{ + struct domain *dp; + struct protosw *pr; + + if (!sa) + return; + for (dp = domains; dp; dp = dp->dom_next) { + /* + * the check must be made by xx_ctlinput() anyways, to + * make sure we 
use data item pointed to by ctlparam in + * correct way. the following check is made just for safety. + */ + if (dp->dom_family != sa->sa_family) + continue; + + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, ctlparam); + } +} + +static void +pfslowtimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_slowtimo) + (*pr->pr_slowtimo)(); + callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL); +} + +static void +pffasttimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + callout_reset(&pffast_callout, hz/5, pffasttimo, NULL); +} diff --git a/sys/kern/uipc_jumbo.c b/sys/kern/uipc_jumbo.c new file mode 100644 index 0000000..4625752 --- /dev/null +++ b/sys/kern/uipc_jumbo.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 1997, Duke University + * All rights reserved. + * + * Author: + * Andrew Gallatin <gallatin@cs.duke.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by Duke University + * 4. The name of Duke University may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITSOR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This is a set of routines for allocating large-sized mbuf payload + * areas, and is primarily intended for use in receive side mbuf + * allocation. 
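+ *
+ * A rough usage sketch (illustrative only, not part of this change):
+ * after a successful jumbo_vm_init(), a driver filling its receive ring
+ * would take a page, derive its kernel virtual address from the pindex,
+ * and attach it to an mbuf as external storage with jumbo_freem() as the
+ * free routine:
+ *
+ *	pg = jumbo_pg_alloc();
+ *	if (pg == NULL)
+ *		return (ENOBUFS);
+ *	buf = (caddr_t)(jumbo_basekva + ptoa(pg->pindex));
+ *	MEXTADD(m, buf, PAGE_SIZE, jumbo_freem, NULL, 0, EXT_DISPOSABLE);
+ *
+ * The EXT_DISPOSABLE storage type above is an assumption; the intent is
+ * whatever external-storage type marks pages that may later be reclaimed
+ * with jumbo_pg_steal().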
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/sockio.h> +#include <sys/uio.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_map.h> +#include <vm/vm_param.h> +#include <vm/vm_pageout.h> +#include <sys/vmmeter.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <sys/proc.h> +#include <sys/jumbo.h> + +/* + * XXX this may be too high or too low. + */ +#define JUMBO_MAX_PAGES 3072 + +struct jumbo_kmap { + vm_offset_t kva; + SLIST_ENTRY(jumbo_kmap) entries; /* Singly-linked List. */ +}; + +static SLIST_HEAD(jumbo_kmap_head, jumbo_kmap) jumbo_kmap_free, + jumbo_kmap_inuse; + +static struct mtx jumbo_mutex; +MTX_SYSINIT(jumbo_lock, &jumbo_mutex, "jumbo mutex", MTX_DEF); + +static struct vm_object *jumbo_vm_object; +static unsigned long jumbo_vmuiomove_pgs_freed = 0; +#if 0 +static int jumbo_vm_wakeup_wanted = 0; +#endif +vm_offset_t jumbo_basekva; + +int +jumbo_vm_init(void) +{ + int i; + struct jumbo_kmap *entry; + + mtx_lock(&jumbo_mutex); + + if (jumbo_vm_object != NULL) { + mtx_unlock(&jumbo_mutex); + return (1); + } + + /* allocate our object */ + jumbo_vm_object = vm_object_allocate_wait(OBJT_DEFAULT, JUMBO_MAX_PAGES, + M_NOWAIT); + + if (jumbo_vm_object == NULL) { + mtx_unlock(&jumbo_mutex); + return (0); + } + + SLIST_INIT(&jumbo_kmap_free); + SLIST_INIT(&jumbo_kmap_inuse); + + /* grab some kernel virtual address space */ + jumbo_basekva = kmem_alloc_pageable(kernel_map, + PAGE_SIZE * JUMBO_MAX_PAGES); + if (jumbo_basekva == 0) { + vm_object_deallocate(jumbo_vm_object); + jumbo_vm_object = NULL; + mtx_unlock(&jumbo_mutex); + return 0; + } + for (i = 0; i < JUMBO_MAX_PAGES; i++) { + entry = malloc(sizeof(struct jumbo_kmap), M_TEMP, M_NOWAIT); + if (!entry && !i) { + mtx_unlock(&jumbo_mutex); + panic("jumbo_vm_init: unable to allocated kvas"); + } else if (!entry) { + printf("warning: jumbo_vm_init allocated only %d kva\n", + i); + mtx_unlock(&jumbo_mutex); + return 1; + } + entry->kva = jumbo_basekva + (vm_offset_t)i * PAGE_SIZE; + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + } + mtx_unlock(&jumbo_mutex); + return 1; +} + +void +jumbo_freem(caddr_t addr, void *args) +{ + vm_page_t frame; + + frame = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)addr)); + + /* + * Need giant for looking at the hold count below. Convert this + * to the vm mutex once the VM code has been moved out from under + * giant. 
+ */ + GIANT_REQUIRED; + + if (frame->hold_count == 0) + jumbo_pg_free((vm_offset_t)addr); + else + printf("jumbo_freem: hold count for %p is %d!!??\n", + frame, frame->hold_count); +} + +void +jumbo_pg_steal(vm_page_t pg) +{ + vm_offset_t addr; + struct jumbo_kmap *entry; + + addr = ptoa(pg->pindex) + jumbo_basekva; + + if (pg->object != jumbo_vm_object) + panic("stealing a non jumbo_vm_object page"); + vm_page_remove(pg); + + mtx_lock(&jumbo_mutex); + + pmap_qremove(addr,1); + entry = SLIST_FIRST(&jumbo_kmap_inuse); + entry->kva = addr; + SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + + mtx_unlock(&jumbo_mutex); + +#if 0 + if (jumbo_vm_wakeup_wanted) + wakeup(jumbo_vm_object); +#endif +} + + +vm_page_t +jumbo_pg_alloc(void) +{ + vm_page_t pg; + vm_pindex_t pindex; + struct jumbo_kmap *entry; + + pg = NULL; + mtx_lock(&jumbo_mutex); + + entry = SLIST_FIRST(&jumbo_kmap_free); + if (entry != NULL){ + pindex = atop(entry->kva - jumbo_basekva); + pg = vm_page_alloc(jumbo_vm_object, pindex, VM_ALLOC_INTERRUPT); + if (pg != NULL) { + SLIST_REMOVE_HEAD(&jumbo_kmap_free, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_inuse, entry, entries); + pmap_qenter(entry->kva, &pg, 1); + } + } + mtx_unlock(&jumbo_mutex); + return(pg); +} + +void +jumbo_pg_free(vm_offset_t addr) +{ + struct jumbo_kmap *entry; + vm_offset_t paddr; + vm_page_t pg; + + paddr = pmap_kextract((vm_offset_t)addr); + pg = PHYS_TO_VM_PAGE(paddr); + + if (pg->object != jumbo_vm_object) { + jumbo_vmuiomove_pgs_freed++; +/* if(vm_page_lookup(jumbo_vm_object, atop(addr - jumbo_basekva))) + panic("vm_page_rename didn't"); + printf("freeing uiomoved pg:\t pindex = %d, padd = 0x%lx\n", + atop(addr - jumbo_basekva), paddr); +*/ + } else { + vm_page_busy(pg); /* vm_page_free wants pages to be busy*/ + vm_page_free(pg); + } + + mtx_lock(&jumbo_mutex); + + pmap_qremove(addr,1); + entry = SLIST_FIRST(&jumbo_kmap_inuse); + entry->kva = addr; + SLIST_REMOVE_HEAD(&jumbo_kmap_inuse, entries); + SLIST_INSERT_HEAD(&jumbo_kmap_free, entry, entries); + + mtx_unlock(&jumbo_mutex); + +#if 0 + if (jumbo_vm_wakeup_wanted) + wakeup(jumbo_vm_object); +#endif +} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 0000000..27ca156 --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +/* + * sysctl(8) exported objects + */ +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); + +/* + * struct mbuf * + * m_getm(m, len, how, type) + * + * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits + * best) and return a pointer to the top of the allocated chain. If m is + * non-null, then we assume that it is a single mbuf or an mbuf chain to + * which we want len bytes worth of mbufs and/or clusters attached, and so + * if we succeed in allocating it, we will just return a pointer to m. + * + * If we happen to fail at any point during the allocation, we will free + * up everything we have already allocated and return NULL. + * + */ +struct mbuf * +m_getm(struct mbuf *m, int len, int how, int type) +{ + struct mbuf *top, *tail, *mp, *mtail = NULL; + + KASSERT(len >= 0, ("len is < 0 in m_getm")); + + MGET(mp, how, type); + if (mp == NULL) + return (NULL); + else if (len > MINCLSIZE) { + MCLGET(mp, how); + if ((mp->m_flags & M_EXT) == 0) { + m_free(mp); + return (NULL); + } + } + mp->m_len = 0; + len -= M_TRAILINGSPACE(mp); + + if (m != NULL) + for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); + else + m = mp; + + top = tail = mp; + while (len > 0) { + MGET(mp, how, type); + if (mp == NULL) + goto failed; + + tail->m_next = mp; + tail = mp; + if (len > MINCLSIZE) { + MCLGET(mp, how); + if ((mp->m_flags & M_EXT) == 0) + goto failed; + } + + mp->m_len = 0; + len -= M_TRAILINGSPACE(mp); + } + + if (mtail != NULL) + mtail->m_next = top; + return (m); + +failed: + m_freem(top); + return (NULL); +} + +void +m_freem(struct mbuf *m) +{ + while (m) { + m = m_free(m); + } +} + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
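+ *
+ * Callers normally go through the M_PREPEND() macro, which only falls
+ * back to this routine when the first mbuf has no leading space left.
+ * A hedged sketch of typical protocol usage (struct examplehdr is
+ * hypothetical):
+ *
+ *	M_PREPEND(m, sizeof(struct examplehdr), M_DONTWAIT);
+ *	if (m == NULL)
+ *		return (ENOBUFS);
+ *	eh = mtod(m, struct examplehdr *);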
+ */ +struct mbuf * +m_prepend(struct mbuf *m, int len, int how) +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == NULL) { + m_freem(m); + return (NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller. + * Note that the copy is read-only, because clusters are not copied, + * only their reference counts are incremented. + */ +struct mbuf * +m_copym(struct mbuf *m, int off0, int len, int wait) +{ + struct mbuf *n, **np; + int off = off0; + struct mbuf *top; + int copyhdr = 0; + + KASSERT(off >= 0, ("m_copym, negative off %d", off)); + KASSERT(len >= 0, ("m_copym, negative len %d", len)); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = ⊤ + top = 0; + while (len > 0) { + if (m == NULL) { + KASSERT(len == M_COPYALL, + ("m_copym, length > size of mbuf chain")); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == NULL) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == NULL) + mbstat.m_mcfail++; /* XXX: No consistency. */ + + return (top); +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + * Note that the copy is read-only, because clusters are not copied, + * only their reference counts are incremented. + * Preserve alignment of the first mbuf so if the creator has left + * some room at the beginning (e.g. for inserting protocol headers) + * the copies still have the room available. + */ +struct mbuf * +m_copypacket(struct mbuf *m, int how) +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (n == NULL) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else { + n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (o == NULL) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + MEXT_ADD_REF(m); + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. 
*/ + return (NULL); +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. + */ +void +m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) +{ + unsigned count; + + KASSERT(off >= 0, ("m_copydata, negative off %d", off)); + KASSERT(len >= 0, ("m_copydata, negative len %d", len)); + while (off > 0) { + KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Copy a packet header mbuf chain into a completely new chain, including + * copying any mbuf clusters. Use this instead of m_copypacket() when + * you need a writable copy of an mbuf chain. + */ +struct mbuf * +m_dup(struct mbuf *m, int how) +{ + struct mbuf **p, *top = NULL; + int remain, moff, nsize; + + /* Sanity check */ + if (m == NULL) + return (NULL); + KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__)); + + /* While there's more data, get a new mbuf, tack it on, and fill it */ + remain = m->m_pkthdr.len; + moff = 0; + p = ⊤ + while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ + struct mbuf *n; + + /* Get the next new mbuf */ + MGET(n, how, m->m_type); + if (n == NULL) + goto nospace; + if (top == NULL) { /* first one, must be PKTHDR */ + M_COPY_PKTHDR(n, m); + nsize = MHLEN; + } else /* not the first one */ + nsize = MLEN; + if (remain >= MINCLSIZE) { + MCLGET(n, how); + if ((n->m_flags & M_EXT) == 0) { + (void)m_free(n); + goto nospace; + } + nsize = MCLBYTES; + } + n->m_len = 0; + + /* Link it into the new chain */ + *p = n; + p = &n->m_next; + + /* Copy data from original mbuf(s) into new mbuf */ + while (n->m_len < nsize && m != NULL) { + int chunk = min(nsize - n->m_len, m->m_len - moff); + + bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); + moff += chunk; + n->m_len += chunk; + remain -= chunk; + if (moff == m->m_len) { + m = m->m_next; + moff = 0; + } + } + + /* Check correct total mbuf length */ + KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), + ("%s: bogus m_pkthdr.len", __func__)); + } + return (top); + +nospace: + m_freem(top); + mbstat.m_mcfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +void +m_cat(struct mbuf *m, struct mbuf *n) +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} + +void +m_adj(struct mbuf *mp, int req_len) +{ + int len = req_len; + struct mbuf *m; + int count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m->m_next) + (m = m->m_next) ->m_len = 0; + } +} + +/* + * Rearange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + */ +struct mbuf * +m_pullup(struct mbuf *n, int len) +{ + struct mbuf *m; + int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. + */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == NULL) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + mbstat.m_mpfail++; /* XXX: No consistency. */ + return (NULL); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. + * + * Note that the resulting mbufs might be read-only, because the new + * mbuf can end up sharing an mbuf cluster with the original mbuf if + * the "breaking point" happens to lie within a cluster mbuf. Use the + * M_WRITABLE() macro to check for this case. 
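+ *
+ * Illustrative sketch only (the caller, "tail" and "hdrlen" below are
+ * hypothetical, not part of this file): a consumer that intends to
+ * modify the returned tail would typically guard the write with
+ * M_WRITABLE(), e.g.
+ *
+ *	tail = m_split(m0, hdrlen, M_DONTWAIT);
+ *	if (tail != NULL && !M_WRITABLE(tail))
+ *		...obtain a private copy first (for instance via
+ *		   m_dup()) before touching the data...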
+ */ +struct mbuf * +m_split(struct mbuf *m0, int len0, int wait) +{ + struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == NULL) + return (NULL); + remain = m->m_len - len; + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == NULL) + return (NULL); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == NULL) { + (void) m_free(n); + return (NULL); + } else { + n->m_len = 0; + return (n); + } + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { + n = m->m_next; + m->m_next = NULL; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == NULL) + return (NULL); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + MEXT_ADD_REF(m); + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = NULL; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. + * Note that `off' argument is offset into first mbuf of target chain from + * which to begin copying the data to. + */ +struct mbuf * +m_devget(char *buf, int totlen, int off, struct ifnet *ifp, + void (*copy)(char *from, caddr_t to, u_int len)) +{ + struct mbuf *m; + struct mbuf *top = 0, **mp = ⊤ + int len; + + if (off < 0 || off > MHLEN) + return (NULL); + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + len = MHLEN; + + while (totlen > 0) { + if (top) { + MGET(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + m_freem(top); + return (NULL); + } + len = MLEN; + } + if (totlen + off >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + len = MCLBYTES; + } else { + /* + * Place initial small packet/header at end of mbuf. + */ + if (top == NULL && totlen + off + max_linkhdr <= len) { + m->m_data += max_linkhdr; + len -= max_linkhdr; + } + } + if (off) { + m->m_data += off; + len -= off; + off = 0; + } + m->m_len = len = min(totlen, len); + if (copy) + copy(buf, mtod(m, caddr_t), (unsigned)len); + else + bcopy(buf, mtod(m, caddr_t), (unsigned)len); + buf += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + } + return (top); +} + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. 
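+ *
+ * Minimal usage sketch (the caller, "somehdr" and "offset" below are
+ * hypothetical):
+ *
+ *	struct somehdr hdr;
+ *	...
+ *	m_copyback(m0, offset, sizeof(hdr), (caddr_t)&hdr);
+ *
+ * Because any mbufs needed to extend the chain are allocated with
+ * M_DONTWAIT and the routine returns void, a failed allocation simply
+ * truncates the store; callers must not assume the full write happened.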
+ */ +void +m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) +{ + int mlen; + struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == NULL) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == NULL) { + n = m_get_clrd(M_DONTWAIT, m->m_type); + if (n == NULL) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == NULL) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == NULL) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} + +void +m_print(const struct mbuf *m) +{ + int len; + const struct mbuf *m2; + + len = m->m_pkthdr.len; + m2 = m; + while (len) { + printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-"); + len -= m2->m_len; + m2 = m2->m_next; + } + return; +} diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c new file mode 100644 index 0000000..37ee53e --- /dev/null +++ b/sys/kern/uipc_mbuf2.c @@ -0,0 +1,404 @@ +/* $FreeBSD$ */ +/* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */ +/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 + */ + +/*#define PULLDOWN_DEBUG*/ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> + +/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */ +static struct mbuf *m_dup1(struct mbuf *, int, int, int); + +/* + * ensure that [off, off + len) is contiguous on the mbuf chain "m". + * packet chain before "off" is kept untouched. + * if offp == NULL, the target will start at <retval, 0> on resulting chain. + * if offp != NULL, the target will start at <retval, *offp> on resulting chain. + * + * on error return (NULL return value), original "m" will be freed. + * + * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf. + */ +struct mbuf * +m_pulldown(struct mbuf *m, int off, int len, int *offp) +{ + struct mbuf *n, *o; + int hlen, tlen, olen; + int writable; + + /* check invalid arguments. */ + if (m == NULL) + panic("m == NULL in m_pulldown()"); + if (len > MCLBYTES) { + m_freem(m); + return NULL; /* impossible */ + } + +#ifdef PULLDOWN_DEBUG + { + struct mbuf *t; + printf("before:"); + for (t = m; t; t = t->m_next) + printf(" %d", t->m_len); + printf("\n"); + } +#endif + n = m; + while (n != NULL && off > 0) { + if (n->m_len > off) + break; + off -= n->m_len; + n = n->m_next; + } + /* be sure to point non-empty mbuf */ + while (n != NULL && n->m_len == 0) + n = n->m_next; + if (!n) { + m_freem(m); + return NULL; /* mbuf chain too short */ + } + + /* + * XXX: This code is flawed because it considers a "writable" mbuf + * data region to require all of the following: + * (i) mbuf _has_ to have M_EXT set; if it is just a regular + * mbuf, it is still not considered "writable." + * (ii) since mbuf has M_EXT, the ext_type _has_ to be + * EXT_CLUSTER. Anything else makes it non-writable. + * (iii) M_WRITABLE() must evaluate true. + * Ideally, the requirement should only be (iii). + * + * If we're writable, we're sure we're writable, because the ref. 
count + * cannot increase from 1, as that would require posession of mbuf + * n by someone else (which is impossible). However, if we're _not_ + * writable, we may eventually become writable )if the ref. count drops + * to 1), but we'll fail to notice it unless we re-evaluate + * M_WRITABLE(). For now, we only evaluate once at the beginning and + * live with this. + */ + /* + * XXX: This is dumb. If we're just a regular mbuf with no M_EXT, + * then we're not "writable," according to this code. + */ + writable = 0; + if ((n->m_flags & M_EXT) == 0 || + (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n))) + writable = 1; + + /* + * the target data is on <n, off>. + * if we got enough data on the mbuf "n", we're done. + */ + if ((off == 0 || offp) && len <= n->m_len - off && writable) + goto ok; + + /* + * when len <= n->m_len - off and off != 0, it is a special case. + * len bytes from <n, off> sits in single mbuf, but the caller does + * not like the starting position (off). + * chop the current mbuf into two pieces, set off to 0. + */ + if (len <= n->m_len - off) { + o = m_dup1(n, off, n->m_len - off, M_DONTWAIT); + if (o == NULL) { + m_freem(m); + return NULL; /* ENOBUFS */ + } + n->m_len = off; + o->m_next = n->m_next; + n->m_next = o; + n = n->m_next; + off = 0; + goto ok; + } + + /* + * we need to take hlen from <n, off> and tlen from <n->m_next, 0>, + * and construct contiguous mbuf with m_len == len. + * note that hlen + tlen == len, and tlen > 0. + */ + hlen = n->m_len - off; + tlen = len - hlen; + + /* + * ensure that we have enough trailing data on mbuf chain. + * if not, we can do nothing about the chain. + */ + olen = 0; + for (o = n->m_next; o != NULL; o = o->m_next) + olen += o->m_len; + if (hlen + olen < len) { + m_freem(m); + return NULL; /* mbuf chain too short */ + } + + /* + * easy cases first. + * we need to use m_copydata() to get data from <n->m_next, 0>. + */ + if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen + && writable) { + m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len); + n->m_len += tlen; + m_adj(n->m_next, tlen); + goto ok; + } + if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen + && writable) { + n->m_next->m_data -= hlen; + n->m_next->m_len += hlen; + bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); + n->m_len -= hlen; + n = n->m_next; + off = 0; + goto ok; + } + + /* + * now, we need to do the hard way. don't m_copy as there's no room + * on both end. + */ + MGET(o, M_DONTWAIT, m->m_type); + if (o && len > MLEN) { + MCLGET(o, M_DONTWAIT); + if ((o->m_flags & M_EXT) == 0) { + m_free(o); + o = NULL; + } + } + if (!o) { + m_freem(m); + return NULL; /* ENOBUFS */ + } + /* get hlen from <n, off> into <o, 0> */ + o->m_len = hlen; + bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen); + n->m_len -= hlen; + /* get tlen from <n->m_next, 0> into <o, hlen> */ + m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len); + o->m_len += tlen; + m_adj(n->m_next, tlen); + o->m_next = n->m_next; + n->m_next = o; + n = o; + off = 0; + +ok: +#ifdef PULLDOWN_DEBUG + { + struct mbuf *t; + printf("after:"); + for (t = m; t; t = t->m_next) + printf("%c%d", t == n ? 
'*' : ' ', t->m_len); + printf(" (off=%d)\n", off); + } +#endif + if (offp) + *offp = off; + return n; +} + +static struct mbuf * +m_dup1(struct mbuf *m, int off, int len, int wait) +{ + struct mbuf *n; + int l; + int copyhdr; + + if (len > MCLBYTES) + return NULL; + if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { + copyhdr = 1; + MGETHDR(n, wait, m->m_type); + l = MHLEN; + } else { + copyhdr = 0; + MGET(n, wait, m->m_type); + l = MLEN; + } + if (n && len > l) { + MCLGET(n, wait); + if ((n->m_flags & M_EXT) == 0) { + m_free(n); + n = NULL; + } + } + if (!n) + return NULL; + + if (copyhdr) + M_COPY_PKTHDR(n, m); + m_copydata(m, off, len, mtod(n, caddr_t)); + return n; +} + +/* + * pkthdr.aux chain manipulation. + * we don't allow clusters at this moment. + */ +struct mbuf * +m_aux_add2(struct mbuf *m, int af, int type, void *p) +{ + struct mbuf *n; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return NULL; + + n = m_aux_find(m, af, type); + if (n) + return n; + + MGET(n, M_DONTWAIT, m->m_type); + if (n == NULL) + return NULL; + + t = mtod(n, struct mauxtag *); + bzero(t, sizeof(*t)); + t->af = af; + t->type = type; + t->p = p; + n->m_data += sizeof(struct mauxtag); + n->m_len = 0; + n->m_next = m->m_pkthdr.aux; + m->m_pkthdr.aux = n; + return n; +} + +struct mbuf * +m_aux_find2(struct mbuf *m, int af, int type, void *p) +{ + struct mbuf *n; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return NULL; + + for (n = m->m_pkthdr.aux; n; n = n->m_next) { + t = (struct mauxtag *)n->m_dat; + if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { + printf("m_aux_find: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); + continue; + } + if (t->af == af && t->type == type && t->p == p) + return n; + } + return NULL; +} + +struct mbuf * +m_aux_find(struct mbuf *m, int af, int type) +{ + + return m_aux_find2(m, af, type, NULL); +} + +struct mbuf * +m_aux_add(struct mbuf *m, int af, int type) +{ + + return m_aux_add2(m, af, type, NULL); +} + +void +m_aux_delete(struct mbuf *m, struct mbuf *victim) +{ + struct mbuf *n, *prev, *next; + struct mauxtag *t; + + if ((m->m_flags & M_PKTHDR) == 0) + return; + + prev = NULL; + n = m->m_pkthdr.aux; + while (n) { + t = (struct mauxtag *)n->m_dat; + next = n->m_next; + if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { + printf("m_aux_delete: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); + prev = n; + n = next; + continue; + } + if (n == victim) { + if (prev) + prev->m_next = n->m_next; + else + m->m_pkthdr.aux = n->m_next; + n->m_next = NULL; + m_free(n); + return; + } else + prev = n; + n = next; + } +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 0000000..74dab78 --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/un.h> + +#include <net/raw_cb.h> + +/* + * Definitions of protocols supported in the LOCAL domain. + */ + +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, &uipc_ctloutput, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ 0, 0, 0, 0, + 0, 0, raw_ctlinput, 0, + 0, + raw_init, 0, 0, 0, + &raw_usrreqs +} +}; + +struct domain localdomain = + { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; +DOMAIN_SET(local); + +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..1e68f83 --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,983 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/aio.h> /* for aio_swake proto */ +#include <sys/domain.h> +#include <sys/event.h> +#include <sys/file.h> /* for maxfiles */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +int maxsockets; + +void (*aio_swake)(struct socket *, struct sockbuf *); + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_incomp for connections in progress + * and so_comp for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_incomp by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_comp, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_incomp or so_comp, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
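+ *
+ * Sketch of the expected call sequence from a connection-oriented
+ * protocol (illustrative only; error paths omitted):
+ *
+ *	soisconnecting(so);	   connect() request is in progress
+ *	...			   protocol handshake completes
+ *	soisconnected(so);	   embryonic socket moves to so_comp
+ *	...
+ *	soisdisconnecting(so);	   disconnect() has been initiated
+ *	soisdisconnected(so);	   connection to the peer fully severed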
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + struct socket *so; +{ + struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + so->so_upcall = head->so_accf->so_accept_filter->accf_callback; + so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_options &= ~SO_ACCEPTFILTER; + so->so_upcall(so, so->so_upcallarg, 0); + return; + } + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + head->so_qlen++; + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * note: the ref count on the socket is 0 on return + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + if ((head->so_options & SO_ACCEPTFILTER) != 0) + connstatus = 0; + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_cred = crhold(head->so_cred); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || + (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sotryfree(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + head->so_qlen++; + } else { + if (head->so_incqlen > head->so_qlimit) { + struct socket *sp; + sp = TAILQ_FIRST(&head->so_incomp); + (void) soabort(sp); + } + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + if (connstatus) { + sorwakeup(head); + wakeup(&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). 
Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep(&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep(&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup(&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_AIO) + aio_swake(so, sb); + KNOTE(&sb->sb_sel.si_note, 0); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. 
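+ *
+ * The usual pairing, as soreserve() below does (sketch only; error
+ * handling abbreviated):
+ *
+ *	if (sbreserve(&so->so_snd, sndcc, so, td) == 0)
+ *		return (ENOBUFS);
+ *	...
+ *	sbrelease(&so->so_snd, so);	   on socket teardown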
+ */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + struct thread *td = curthread; + + if (sbreserve(&so->so_snd, sndcc, so, td) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd, so); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc, so, td) + struct sockbuf *sb; + u_long cc; + struct socket *so; + struct thread *td; +{ + + /* + * td will only be NULL when we're in an interrupt + * (e.g. in tcp_input()) + */ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, + td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) { + return (0); + } + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb, so) + struct sockbuf *sb; + struct socket *so; +{ + + sbflush(sb); + (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, + RLIM_INFINITY); + sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
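+ *
+ * Typical datagram-protocol usage (sketch; "from" is a hypothetical
+ * sockaddr filled in by the protocol):
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from,
+ *	    m, control) == 0) {
+ *		m_freem(m);	   no room: drop the datagram
+ *		if (control)
+ *			m_freem(control);
+ *	} else
+ *		sorwakeup(so);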
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + + if (m0 && (m0->m_flags & M_PKTHDR) == 0) + panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy(asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & M_EOR) == 0 && + M_WRITABLE(n) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n) && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt) { + /* + * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: + * we would loop forever. Panic instead. + */ + if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) + break; + sbdrop(sb, (int)sb->sb_cc); + } + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + m = m_free(m); + } + while (m && m->m_len == 0) { + sbfree(sb, m); + m = m_free(m); + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + m = m_free(m); + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if (CMSG_SPACE((u_int)size) > MCLBYTES) + return ((struct mbuf *) NULL); + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + if (CMSG_SPACE((u_int)size) > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ((struct mbuf *) NULL); + } + } + cp = mtod(m, struct cmsghdr *); + m->m_len = 0; + KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), + ("sbcreatecontrol: short mbuf")); + if (p != NULL) + (void)memcpy(CMSG_DATA(cp), p, size); + m->m_len = CMSG_SPACE(size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
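+ *
+ * Typical consumer (sketch): a sysctl handler copies the external form
+ * out to userland instead of exposing struct socket directly, e.g.
+ *
+ *	struct xsocket xso;
+ *
+ *	sotoxsocket(so, &xso);
+ *	error = SYSCTL_OUT(req, &xso, sizeof(xso));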
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_cred->cr_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, + &sb_max, 0, "Maximum socket buffer size"); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, + &maxsockets, 0, "Maximum number of sockets avaliable"); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); + +/* + * Initialise maxsockets + */ +static void init_maxsockets(void *ignored) +{ + TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); + maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); +} +SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 0000000..d596294 --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1792 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_zero.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/domain.h> +#include <sys/file.h> /* for struct knote */ +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/event.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <sys/jail.h> + +#include <vm/uma.h> + +#include <machine/limits.h> + +#ifdef INET +static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); +#endif + +static void filt_sordetach(struct knote *kn); +static int filt_soread(struct knote *kn, long hint); +static void filt_sowdetach(struct knote *kn); +static int filt_sowrite(struct knote *kn, long hint); +static int filt_solisten(struct knote *kn, long hint); + +static struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +uma_zone_t socket_zone; +so_gen_t so_gencnt; /* generation count for sockets */ + +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); +MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); + +SYSCTL_DECL(_kern_ipc); + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, + &somaxconn, 0, "Maximum pending socket connection queue size"); +static int numopensockets; +SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, + &numopensockets, 0, "Number of open sockets"); +#ifdef ZERO_COPY_SOCKETS +/* These aren't static because they're used in other files. */ +int so_zero_copy_send = 1; +int so_zero_copy_receive = 1; +SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, + "Zero copy controls"); +SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, + &so_zero_copy_receive, 0, "Enable zero copy receive"); +SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, + &so_zero_copy_send, 0, "Enable zero copy send"); +#endif /* ZERO_COPY_SOCKETS */ + + +/* + * Socket operation routines. + * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. + */ + +/* + * Get a socket structure from our zone, and initialize it. 
+ * Note that it would probably be better to allocate socket + * and PCB at the same time, but I'm not convinced that all + * the protocols can be easily modified to do this. + * + * soalloc() returns a socket with a ref count of 0. + */ +struct socket * +soalloc(waitok) + int waitok; +{ + struct socket *so; + int flag; + + if (waitok == 1) + flag = M_WAITOK; + else + flag = M_NOWAIT; + flag |= M_ZERO; + so = uma_zalloc(socket_zone, flag); + if (so) { + /* XXX race condition for reentrant kernel */ + so->so_gencnt = ++so_gencnt; + /* sx_init(&so->so_sxlock, "socket sxlock"); */ + TAILQ_INIT(&so->so_aiojobq); + ++numopensockets; + } + return so; +} + +/* + * socreate returns a socket with a ref count of 1. The socket should be + * closed with soclose(). + */ +int +socreate(dom, aso, type, proto, cred, td) + int dom; + struct socket **aso; + register int type; + int proto; + struct ucred *cred; + struct thread *td; +{ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + + if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) + return (EPROTONOSUPPORT); + + if (jailed(td->td_ucred) && jail_socket_unixiproute_only && + prp->pr_domain->dom_family != PF_LOCAL && + prp->pr_domain->dom_family != PF_INET && + prp->pr_domain->dom_family != PF_ROUTE) { + return (EPROTONOSUPPORT); + } + + if (prp->pr_type != type) + return (EPROTOTYPE); + so = soalloc(M_NOWAIT); + if (so == NULL) + return (ENOBUFS); + + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); + so->so_type = type; + so->so_cred = crhold(cred); + so->so_proto = prp; + soref(so); + error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + if (error) { + so->so_state |= SS_NOFDREF; + sorele(so); + return (error); + } + *aso = so; + return (0); +} + +int +sobind(so, nam, td) + struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + int s = splnet(); + int error; + + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + splx(s); + return (error); +} + +static void +sodealloc(struct socket *so) +{ + + KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); + so->so_gencnt = ++so_gencnt; + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); +#ifdef INET + if (so->so_accf != NULL) { + if (so->so_accf->so_accept_filter != NULL && + so->so_accf->so_accept_filter->accf_destroy != NULL) { + so->so_accf->so_accept_filter->accf_destroy(so); + } + if (so->so_accf->so_accept_filter_str != NULL) + FREE(so->so_accf->so_accept_filter_str, M_ACCF); + FREE(so->so_accf, M_ACCF); + } +#endif + crfree(so->so_cred); + /* sx_destroy(&so->so_sxlock); */ + uma_zfree(socket_zone, so); + --numopensockets; +} + +int +solisten(so, backlog, td) + register struct socket *so; + int backlog; + struct thread *td; +{ + int s, error; + + s = splnet(); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td); + if (error) { + splx(s); + return (error); + } + if (TAILQ_EMPTY(&so->so_comp)) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; + splx(s); + return (0); +} + +void +sofree(so) + register struct socket *so; +{ + struct socket *head = so->so_head; + + KASSERT(so->so_count == 0, ("socket %p so_count not 0", so)); + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + 
return; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + /* + * We must not decommission a socket that's + * on the accept(2) queue. If we do, then + * accept(2) may hang after select(2) indicated + * that the listening socket was ready. + */ + return; + } else { + panic("sofree: not queued"); + } + so->so_state &= ~SS_INCOMP; + so->so_head = NULL; + } + sbrelease(&so->so_snd, so); + sorflush(so); + sodealloc(so); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. + * + * This function will sorele() the socket. Note that soclose() may be + * called prior to the ref count reaching zero. The actual socket + * structure will not be freed until the ref count reaches zero. + */ +int +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + funsetown(&so->so_sigio); + if (so->so_options & SO_ACCEPTCONN) { + struct socket *sp, *sonext; + + sp = TAILQ_FIRST(&so->so_incomp); + for (; sp != NULL; sp = sonext) { + sonext = TAILQ_NEXT(sp, so_list); + (void) soabort(sp); + } + for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) { + sonext = TAILQ_NEXT(sp, so_list); + /* Dequeue from so_comp since sofree() won't do it */ + TAILQ_REMOVE(&so->so_comp, sp, so_list); + so->so_qlen--; + sp->so_state &= ~SS_COMP; + sp->so_head = NULL; + (void) soabort(sp); + } + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) { + error = tsleep(&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger * hz); + if (error) + break; + } + } + } +drop: + if (so->so_pcb) { + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sorele(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... + */ +int +soabort(so) + struct socket *so; +{ + int error; + + error = (*so->so_proto->pr_usrreqs->pru_abort)(so); + if (error) { + sotryfree(so); /* note: does not decrement the ref count */ + return error; + } + return (0); +} + +int +soaccept(so, nam) + register struct socket *so; + struct sockaddr **nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); + splx(s); + return (error); +} + +int +soconnect(so, nam, td) + register struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. 
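+ *
+ * (Userland sketch: for a datagram socket, calling connect(2) a second
+ * time with a new address simply re-targets the socket, because the
+ * implicit sodisconnect() here runs first; connecting to a null or
+ * AF_UNSPEC address is the traditional way to dissolve the
+ * association.)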
+ */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + splx(s); + return (error); +} + +int +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); + splx(s); + return (error); +} + +int +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. + * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. + * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. + */ + +#ifdef ZERO_COPY_SOCKETS +struct so_zerocopy_stats{ + int size_ok; + int align_ok; + int found_ifp; +}; +struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; +#include <netinet/in.h> +#include <net/route.h> +#include <netinet/in_pcb.h> +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#endif /*ZERO_COPY_SOCKETS*/ + +int +sosend(so, addr, uio, top, control, flags, td) + register struct socket *so; + struct sockaddr *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; + struct thread *td; +{ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; +#ifdef ZERO_COPY_SOCKETS + int cow_send; +#endif /* ZERO_COPY_SOCKETS */ + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. + * + * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM + * type sockets since that's an error. 
+ */ + if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { + error = EINVAL; + goto out; + } + + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + if (td) + td->td_proc->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto release; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? + ENOTCONN : EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = ⊤ + space -= clen; + do { + if (uio == NULL) { + /* + * Data is prepackaged in "top". + */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { +#ifdef ZERO_COPY_SOCKETS + cow_send = 0; +#endif /* ZERO_COPY_SOCKETS */ + if (top == 0) { + MGETHDR(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + mlen = MLEN; + } + if (resid >= MINCLSIZE) { +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_send && + resid>=PAGE_SIZE && + space>=PAGE_SIZE && + uio->uio_iov->iov_len>=PAGE_SIZE) { + so_zerocp_stats.size_ok++; + if (!((vm_offset_t) + uio->uio_iov->iov_base & PAGE_MASK)){ + so_zerocp_stats.align_ok++; + cow_send = socow_setup(m, uio); + } + } + if (!cow_send){ +#endif /* ZERO_COPY_SOCKETS */ + MCLGET(m, M_TRYWAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; + len = min(min(mlen, resid), space); + } else { +#ifdef ZERO_COPY_SOCKETS + len = PAGE_SIZE; + } + + } else { +#endif /* ZERO_COPY_SOCKETS */ +nopages: + len = min(min(mlen, resid), space); + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. 
+ */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + space -= len; +#ifdef ZERO_COPY_SOCKETS + if (cow_send) + error = 0; + else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + /* + * XXX all the SS_CANTSENDMORE checks previously + * done could be out of date. We could have recieved + * a reset packet in an interrupt or maybe we slept + * while doing page faults in uiomove() etc. We could + * probably recheck again inside the splnet() protection + * here, but there are probably other places that this + * also happens. We must rethink this. + */ + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? + PRUS_EOF : + /* If there is more to send set PRUS_MORETOCOME */ + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, + top, addr, control, td); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = ⊤ + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. 
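+ *
+ * A caller that wants the data left in mbuf form rather than copied out
+ * might therefore do something like (sketch only; the names are
+ * illustrative):
+ *
+ *	auio.uio_resid = len;			only the count is used
+ *	error = soreceive(so, NULL, &auio, &m, NULL, NULL);
+ *
+ * and then walk the returned chain "m" directly.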
+ */ +int +soreceive(so, psa, uio, mp0, controlp, flagsp) + register struct socket *so; + struct sockaddr **psa; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type = 0; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (psa) + *psa = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_TRYWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); + if (error) + goto bad; + do { +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_receive) { + vm_page_t pg; + int disposable; + + if ((m->m_flags & M_EXT) + && (m->m_ext.ext_type == EXT_DISPOSABLE)) + disposable = 1; + else + disposable = 0; + + pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t))); + if (uio->uio_offset == -1) + uio->uio_offset =IDX_TO_OFF(pg->pindex); + + error = uiomoveco(mtod(m, caddr_t), + min(uio->uio_resid, m->m_len), + uio, pg->object, + disposable); + } else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreqs->pru_rcvd)(so, 0); + +restart: + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. 
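+	 * For example, on a stream socket with 1k of data queued, a
+	 * low-water mark of 1 and a 4k request, a plain read returns the
+	 * 1k immediately (a short count), whereas MSG_WAITALL makes the
+	 * caller sleep until the full 4k, an error, or end of file arrives.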
+ */ + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + KASSERT(m != 0 || !so->so_rcv.sb_cc, + ("receive: m == %p so->so_rcv.sb_cc == %lu", + m, so->so_rcv.sb_cc)); + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_td) + uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { + KASSERT(m->m_type == MT_SONAME, + ("m->m_type == %d", m->m_type)); + orig_resid = 0; + if (psa) + *psa = dup_sockaddr(mtod(m, struct sockaddr *), + mp0 == 0); + if (flags & MSG_PEEK) { + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m->m_next; + m->m_next = NULL; + if (pr->pr_domain->dom_externalize) + error = + (*pr->pr_domain->dom_externalize)(m, controlp); + else if (controlp) + *controlp = m; + else + m_freem(m); + m = so->so_rcv.sb_mb; + } + if (controlp) { + orig_resid = 0; + do + controlp = &(*controlp)->m_next; + while (*controlp != NULL); + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; + else + KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, + ("m->m_type == %d", m->m_type)); + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); +#ifdef ZERO_COPY_SOCKETS + if (so_zero_copy_receive) { + vm_page_t pg; + int disposable; + + if ((m->m_flags & M_EXT) + && (m->m_ext.ext_type == EXT_DISPOSABLE)) + disposable = 1; + else + disposable = 0; + + pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) + + moff)); + + if (uio->uio_offset == -1) + uio->uio_offset =IDX_TO_OFF(pg->pindex); + + error = uiomoveco(mtod(m, caddr_t) + moff, + (int)len, uio,pg->object, + disposable); + } else +#endif /* ZERO_COPY_SOCKETS */ + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + if (error) + goto release; + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_TRYWAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. + */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + /* + * Notify the protocol that some data has been + * drained before blocking. 
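+			 * Stream protocols that set PR_WANTRCVD (TCP, for
+			 * example) use the pru_rcvd() call below to update
+			 * the advertised receive window now that the copy
+			 * above has drained part of the buffer.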
+ */ + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + m = so->so_rcv.sb_mb; + if (m) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +int +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) + return (EINVAL); + + if (how != SHUT_WR) + sorflush(so); + if (how != SHUT_RD) + return ((*pr->pr_usrreqs->pru_shutdown)(so)); + return (0); +} + +void +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero(sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb, so); +} + +#ifdef INET +static int +do_setopt_accept_filter(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct accept_filter_arg *afap = NULL; + struct accept_filter *afp; + struct so_accf *af = so->so_accf; + int error = 0; + + /* do not set/remove accept filters on non listen sockets */ + if ((so->so_options & SO_ACCEPTCONN) == 0) { + error = EINVAL; + goto out; + } + + /* removing the filter */ + if (sopt == NULL) { + if (af != NULL) { + if (af->so_accept_filter != NULL && + af->so_accept_filter->accf_destroy != NULL) { + af->so_accept_filter->accf_destroy(so); + } + if (af->so_accept_filter_str != NULL) { + FREE(af->so_accept_filter_str, M_ACCF); + } + FREE(af, M_ACCF); + so->so_accf = NULL; + } + so->so_options &= ~SO_ACCEPTFILTER; + return (0); + } + /* adding a filter */ + /* must remove previous filter first */ + if (af != NULL) { + error = EINVAL; + goto out; + } + /* don't put large objects on the kernel stack */ + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); + afap->af_name[sizeof(afap->af_name)-1] = '\0'; + afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; + if (error) + goto out; + afp = accept_filt_get(afap->af_name); + if (afp == NULL) { + error = ENOENT; + goto out; + } + MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); + if (afp->accf_create != NULL) { + if (afap->af_name[0] != '\0') { + int len = strlen(afap->af_name) + 1; + + MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); + strcpy(af->so_accept_filter_str, afap->af_name); + } + af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (af->so_accept_filter_arg == NULL) { + FREE(af->so_accept_filter_str, M_ACCF); + FREE(af, M_ACCF); + so->so_accf = NULL; + error = EINVAL; + goto out; + } + } + af->so_accept_filter = afp; + so->so_accf = 
af; + so->so_options |= SO_ACCEPTFILTER; +out: + if (afap != NULL) + FREE(afap, M_TEMP); + return (error); +} +#endif /* INET */ + +/* + * Perhaps this routine, and sooptcopyout(), below, ought to come in + * an additional variant to handle the case where the option value needs + * to be some kind of integer, but not a specific size. + * In addition to their use here, these functions are also called by the + * protocol-level pr_ctloutput() routines. + */ +int +sooptcopyin(sopt, buf, len, minlen) + struct sockopt *sopt; + void *buf; + size_t len; + size_t minlen; +{ + size_t valsize; + + /* + * If the user gives us more than we wanted, we ignore it, + * but if we don't get the minimum length the caller + * wants, we return EINVAL. On success, sopt->sopt_valsize + * is set to however much we actually retrieved. + */ + if ((valsize = sopt->sopt_valsize) < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + + if (sopt->sopt_td != 0) + return (copyin(sopt->sopt_val, buf, valsize)); + + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +int +sosetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + u_long val; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + error = ENOPROTOOPT; + } else { + switch (sopt->sopt_name) { +#ifdef INET + case SO_ACCEPTFILTER: + error = do_setopt_accept_filter(so, sopt); + if (error) + goto bad; + break; +#endif + case SO_LINGER: + error = sooptcopyin(sopt, &l, sizeof l, sizeof l); + if (error) + goto bad; + + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + if (optval) + so->so_options |= sopt->sopt_name; + else + so->so_options &= ~sopt->sopt_name; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + if (optval < 1) { + error = EINVAL; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(sopt->sopt_name == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, (u_long)optval, + so, curthread) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + /* + * Make sure the low-water is never greater than + * the high-water. + */ + case SO_SNDLOWAT: + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? 
+ so->so_rcv.sb_hiwat : optval; + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + error = sooptcopyin(sopt, &tv, sizeof tv, + sizeof tv); + if (error) + goto bad; + + /* assert(hz > 0); */ + if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || + tv.tv_usec < 0 || tv.tv_usec >= 1000000) { + error = EDOM; + goto bad; + } + /* assert(tick > 0); */ + /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ + val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; + if (val > SHRT_MAX) { + error = EDOM; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } + } +bad: + return (error); +} + +/* Helper routine for getsockopt */ +int +sooptcopyout(sopt, buf, len) + struct sockopt *sopt; + void *buf; + size_t len; +{ + int error; + size_t valsize; + + error = 0; + + /* + * Documented get behavior is that we always return a value, + * possibly truncated to fit in the user's buffer. + * Traditional behavior is that we always tell the user + * precisely how much we copied, rather than something useful + * like the total amount we had available for her. + * Note that this interface is not idempotent; the entire answer must + * generated ahead of time. + */ + valsize = min(len, sopt->sopt_valsize); + sopt->sopt_valsize = valsize; + if (sopt->sopt_val != 0) { + if (sopt->sopt_td != 0) + error = copyout(buf, sopt->sopt_val, valsize); + else + bcopy(buf, sopt->sopt_val, valsize); + } + return error; +} + +int +sogetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; +#ifdef INET + struct accept_filter_arg *afap; +#endif + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) { + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } else + return (ENOPROTOOPT); + } else { + switch (sopt->sopt_name) { +#ifdef INET + case SO_ACCEPTFILTER: + if ((so->so_options & SO_ACCEPTCONN) == 0) + return (EINVAL); + MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), + M_TEMP, M_WAITOK | M_ZERO); + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); + if (so->so_accf->so_accept_filter_str != NULL) + strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + } + error = sooptcopyout(sopt, afap, sizeof(*afap)); + FREE(afap, M_TEMP); + break; +#endif + + case SO_LINGER: + l.l_onoff = so->so_options & SO_LINGER; + l.l_linger = so->so_linger; + error = sooptcopyout(sopt, &l, sizeof l); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: + optval = so->so_options & sopt->sopt_name; +integer: + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case SO_TYPE: + optval = so->so_type; + goto integer; + + case SO_ERROR: + optval = so->so_error; + so->so_error = 0; + goto integer; + + case SO_SNDBUF: + optval = so->so_snd.sb_hiwat; + goto integer; + + case SO_RCVBUF: + optval = so->so_rcv.sb_hiwat; + goto integer; + + case SO_SNDLOWAT: + optval = so->so_snd.sb_lowat; + goto integer; + + case SO_RCVLOWAT: + optval = so->so_rcv.sb_lowat; + goto integer; + + case SO_SNDTIMEO: + case 
SO_RCVTIMEO: + optval = (sopt->sopt_name == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + error = sooptcopyout(sopt, &tv, sizeof tv); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); + } +} + +/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ +int +soopt_getm(struct sockopt *sopt, struct mbuf **mp) +{ + struct mbuf *m, *m_prev; + int sopt_size = sopt->sopt_valsize; + + MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); + if (m == 0) + return ENOBUFS; + if (sopt_size > MLEN) { + MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ENOBUFS; + } + m->m_len = min(MCLBYTES, sopt_size); + } else { + m->m_len = min(MLEN, sopt_size); + } + sopt_size -= m->m_len; + *mp = m; + m_prev = m; + + while (sopt_size) { + MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(*mp); + return ENOBUFS; + } + if (sopt_size > MLEN) { + MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(*mp); + return ENOBUFS; + } + m->m_len = min(MCLBYTES, sopt_size); + } else { + m->m_len = min(MLEN, sopt_size); + } + sopt_size -= m->m_len; + m_prev->m_next = m; + m_prev = m; + } + return 0; +} + +/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ +int +soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) +{ + struct mbuf *m0 = m; + + if (sopt->sopt_val == NULL) + return 0; + while (m != NULL && sopt->sopt_valsize >= m->m_len) { + if (sopt->sopt_td != NULL) { + int error; + + error = copyin(sopt->sopt_val, mtod(m, char *), + m->m_len); + if (error != 0) { + m_freem(m0); + return(error); + } + } else + bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); + sopt->sopt_valsize -= m->m_len; + (caddr_t)sopt->sopt_val += m->m_len; + m = m->m_next; + } + if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ + panic("ip6_sooptmcopyin"); + return 0; +} + +/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
*/ +int +soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) +{ + struct mbuf *m0 = m; + size_t valsize = 0; + + if (sopt->sopt_val == NULL) + return 0; + while (m != NULL && sopt->sopt_valsize >= m->m_len) { + if (sopt->sopt_td != NULL) { + int error; + + error = copyout(mtod(m, char *), sopt->sopt_val, + m->m_len); + if (error != 0) { + m_freem(m0); + return(error); + } + } else + bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); + sopt->sopt_valsize -= m->m_len; + (caddr_t)sopt->sopt_val += m->m_len; + valsize += m->m_len; + m = m->m_next; + } + if (m != NULL) { + /* enough soopt buffer should be given from user-land */ + m_freem(m0); + return(EINVAL); + } + sopt->sopt_valsize = valsize; + return 0; +} + +void +sohasoutofband(so) + register struct socket *so; +{ + if (so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGURG, 0); + selwakeup(&so->so_rcv.sb_sel); +} + +int +sopoll(struct socket *so, int events, struct ucred *cred, struct thread *td) +{ + int revents = 0; + int s = splnet(); + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & POLLINIGNEOF) + if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || + !TAILQ_EMPTY(&so->so_comp) || so->so_error) + revents |= POLLINIGNEOF; + + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + if (revents == 0) { + if (events & + (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | + POLLRDBAND)) { + selrecord(td, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + } + } + + splx(s); + return (revents); +} + +int +sokqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + struct sockbuf *sb; + int s; + + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + + s = splnet(); + SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + splx(s); + return (0); +} + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) + so->so_rcv.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + int s = splnet(); + + SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) + so->so_snd.sb_flags &= ~SB_KNOTE; + splx(s); +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, 
long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_data; + + kn->kn_data = so->so_qlen; + return (! TAILQ_EMPTY(&so->so_comp)); +} + +int +socheckuid(struct socket *so, uid_t uid) +{ + + if (so == NULL) + return (EPERM); + if (so->so_cred->cr_uid == uid) + return (0); + return (EPERM); +} diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c new file mode 100644 index 0000000..1e68f83 --- /dev/null +++ b/sys/kern/uipc_socket2.c @@ -0,0 +1,983 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_param.h" +#include <sys/param.h> +#include <sys/aio.h> /* for aio_swake proto */ +#include <sys/domain.h> +#include <sys/event.h> +#include <sys/file.h> /* for maxfiles */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +int maxsockets; + +void (*aio_swake)(struct socket *, struct sockbuf *); + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_incomp for connections in progress + * and so_comp for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_incomp by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_comp, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_incomp or so_comp, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
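+ *
+ * In short, an actively opened connection moves through
+ *
+ *	connect(2) -> soisconnecting() -> ... -> soisconnected()
+ *
+ * and is torn down via
+ *
+ *	disconnect -> soisdisconnecting() -> ... -> soisdisconnected()
+ *
+ * while a passively accepted socket is created by sonewconn() on
+ * so_incomp and moved to so_comp by soisconnected().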
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + struct socket *so; +{ + struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + if ((so->so_options & SO_ACCEPTFILTER) != 0) { + so->so_upcall = head->so_accf->so_accept_filter->accf_callback; + so->so_upcallarg = head->so_accf->so_accept_filter_arg; + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_options &= ~SO_ACCEPTFILTER; + so->so_upcall(so, so->so_upcallarg, 0); + return; + } + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + head->so_qlen++; + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup(&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * note: the ref count on the socket is 0 on return + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + if ((head->so_options & SO_ACCEPTFILTER) != 0) + connstatus = 0; + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_cred = crhold(head->so_cred); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || + (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sotryfree(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + head->so_qlen++; + } else { + if (head->so_incqlen > head->so_qlimit) { + struct socket *sp; + sp = TAILQ_FIRST(&head->so_incomp); + (void) soabort(sp); + } + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + if (connstatus) { + sorwakeup(head); + wakeup(&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). 
Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep(&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep(&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup(&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(&so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); + if (sb->sb_flags & SB_AIO) + aio_swake(so, sb); + KNOTE(&sb->sb_sel.si_note, 0); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. 
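+ *
+ * A typical life cycle is therefore (sketch only):
+ *
+ *	soreserve(so, sndcc, rcvcc);		reserves both buffers
+ *	... sbappend*() and sbdrop() as data flows ...
+ *	sbrelease(&so->so_snd, so);		on teardown
+ *	sbrelease(&so->so_rcv, so);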
+ */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + struct thread *td = curthread; + + if (sbreserve(&so->so_snd, sndcc, so, td) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc, so, td) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd, so); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc, so, td) + struct sockbuf *sb; + u_long cc; + struct socket *so; + struct thread *td; +{ + + /* + * td will only be NULL when we're in an interrupt + * (e.g. in tcp_input()) + */ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, + td ? td->td_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur : RLIM_INFINITY)) { + return (0); + } + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb, so) + struct sockbuf *sb; + struct socket *so; +{ + + sbflush(sb); + (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, + RLIM_INFINITY); + sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
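+ *
+ * Datagram protocols typically call this from their input path, e.g.
+ *
+ *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from, m, opts) == 0)
+ *		goto drop;			no room: free the mbufs
+ *	sorwakeup(so);
+ *
+ * where "from" and "opts" are illustrative names.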
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + + if (m0 && (m0->m_flags & M_PKTHDR) == 0) + panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy(asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & M_EOR) == 0 && + M_WRITABLE(n) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n) && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt) { + /* + * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: + * we would loop forever. Panic instead. + */ + if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) + break; + sbdrop(sb, (int)sb->sb_cc); + } + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + m = m_free(m); + } + while (m && m->m_len == 0) { + sbfree(sb, m); + m = m_free(m); + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + m = m_free(m); + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if (CMSG_SPACE((u_int)size) > MCLBYTES) + return ((struct mbuf *) NULL); + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + if (CMSG_SPACE((u_int)size) > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return ((struct mbuf *) NULL); + } + } + cp = mtod(m, struct cmsghdr *); + m->m_len = 0; + KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), + ("sbcreatecontrol: short mbuf")); + if (p != NULL) + (void)memcpy(CMSG_DATA(cp), p, size); + m->m_len = CMSG_SPACE(size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct thread *td) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
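+ *
+ * Consumers include the protocol pcblist sysctls, which embed an xsocket
+ * in each exported record so that user-level tools such as netstat(1)
+ * can report socket state without knowing the layout of struct socket.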
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_cred->cr_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, + &sb_max, 0, "Maximum socket buffer size"); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, + &maxsockets, 0, "Maximum number of sockets avaliable"); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); + +/* + * Initialise maxsockets + */ +static void init_maxsockets(void *ignored) +{ + TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); + maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); +} +SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c new file mode 100644 index 0000000..1e9c5fa --- /dev/null +++ b/sys/kern/uipc_syscalls.c @@ -0,0 +1,1945 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * sendfile(2) and related extensions: + * Copyright (c) 1998, David Greenman. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/event.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/uio.h> +#include <sys/vnode.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +static void sf_buf_init(void *arg); +SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) +struct sf_buf *sf_buf_alloc(void); +void sf_buf_free(void *addr, void *args); + +static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); +static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); + +static int accept1(struct thread *td, struct accept_args *uap, int compat); +static int getsockname1(struct thread *td, struct getsockname_args *uap, + int compat); +static int getpeername1(struct thread *td, struct getpeername_args *uap, + int compat); + +/* + * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the + * sf_freelist head with the sf_lock mutex. + */ +static struct { + SLIST_HEAD(, sf_buf) sf_head; + struct mtx sf_lock; +} sf_freelist; + +vm_offset_t sf_base; +struct sf_buf *sf_bufs; +u_int sf_buf_alloc_want; + +/* + * System call interface to the socket abstraction. 
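+ *
+ * Most of the entry points below resolve their descriptor argument to a
+ * socket with fgetsock(), carry out the request through the protocol-
+ * independent so*() routines, and drop that reference with fputsock()
+ * before returning.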
+ */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +/* + * MPSAFE + */ +int +socket(td, uap) + struct thread *td; + register struct socket_args /* { + int domain; + int type; + int protocol; + } */ *uap; +{ + struct filedesc *fdp; + struct socket *so; + struct file *fp; + int fd, error; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + error = falloc(td, &fp, &fd); + if (error) + goto done2; + fhold(fp); + error = socreate(uap->domain, &so, uap->type, uap->protocol, + td->td_ucred, td); + FILEDESC_LOCK(fdp); + if (error) { + if (fdp->fd_ofiles[fd] == fp) { + fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + } else { + fp->f_data = so; /* already has ref count */ + fp->f_flag = FREAD|FWRITE; + fp->f_ops = &socketops; + fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(fdp); + td->td_retval[0] = fd; + } + fdrop(fp, td); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +bind(td, uap) + struct thread *td; + register struct bind_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct socket *so; + struct sockaddr *sa; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) + goto done1; + error = sobind(so, sa, td); + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +listen(td, uap) + struct thread *td; + register struct listen_args /* { + int s; + int backlog; + } */ *uap; +{ + struct socket *so; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + error = solisten(so, uap->backlog, td); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * accept1() + * MPSAFE + */ +static int +accept1(td, uap, compat) + struct thread *td; + register struct accept_args /* { + int s; + caddr_t name; + int *anamelen; + } */ *uap; + int compat; +{ + struct filedesc *fdp; + struct file *nfp = NULL; + struct sockaddr *sa; + int namelen, error, s; + struct socket *head, *so; + int fd; + u_int fflag; + + mtx_lock(&Giant); + fdp = td->td_proc->p_fd; + if (uap->name) { + error = copyin(uap->anamelen, &namelen, sizeof (namelen)); + if(error) + goto done2; + } + error = fgetsock(td, uap->s, &head, &fflag); + if (error) + goto done2; + s = splnet(); + if ((head->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + error = EINVAL; + goto done; + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { + splx(s); + error = EWOULDBLOCK; + goto done; + } + while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { + if (head->so_state & SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; + break; + } + error = tsleep(&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { + splx(s); + goto done; + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + splx(s); + goto done; + } + + /* + * At this point we know that there is at least one connection + * ready to be accepted. Remove it from the queue prior to + * allocating the file descriptor for it since falloc() may + * block allowing another process to accept the connection + * instead. + */ + so = TAILQ_FIRST(&head->so_comp); + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + + error = falloc(td, &nfp, &fd); + if (error) { + /* + * Probably ran out of file descriptors. 
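+ * (falloc() fails with EMFILE or ENFILE when the descriptor or file
+ * tables are full.)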
Put the + * unaccepted connection back onto the queue and + * do another wakeup so some other process might + * have a chance at it. + */ + TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); + head->so_qlen++; + wakeup_one(&head->so_timeo); + splx(s); + goto done; + } + fhold(nfp); + td->td_retval[0] = fd; + + /* connection has been removed from the listen queue */ + KNOTE(&head->so_rcv.sb_sel.si_note, 0); + + so->so_state &= ~SS_COMP; + so->so_head = NULL; + if (head->so_sigio != NULL) + fsetown(fgetown(head->so_sigio), &so->so_sigio); + + FILE_LOCK(nfp); + soref(so); /* file descriptor reference */ + nfp->f_data = so; /* nfp has ref count from falloc */ + nfp->f_flag = fflag; + nfp->f_ops = &socketops; + nfp->f_type = DTYPE_SOCKET; + FILE_UNLOCK(nfp); + sa = 0; + error = soaccept(so, &sa); + if (error) { + /* + * return a namelen of zero for older code which might + * ignore the return value from accept. + */ + if (uap->name != NULL) { + namelen = 0; + (void) copyout(&namelen, + uap->anamelen, sizeof(*uap->anamelen)); + } + goto noconnection; + } + if (sa == NULL) { + namelen = 0; + if (uap->name) + goto gotnoname; + splx(s); + error = 0; + goto done; + } + if (uap->name) { + /* check sa_len before it is destroyed */ + if (namelen > sa->sa_len) + namelen = sa->sa_len; +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, uap->name, (u_int)namelen); + if (!error) +gotnoname: + error = copyout(&namelen, + uap->anamelen, sizeof (*uap->anamelen)); + } +noconnection: + if (sa) + FREE(sa, M_SONAME); + + /* + * close the new descriptor, assuming someone hasn't ripped it + * out from under us. + */ + if (error) { + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[fd] == nfp) { + fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(nfp, td); + } else { + FILEDESC_UNLOCK(fdp); + } + } + splx(s); + + /* + * Release explicitly held references before returning. 
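+ * (The extra hold on the new file from fhold() and the reference on the
+ * listening socket taken by fgetsock() are both dropped below.)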
+ */ +done: + if (nfp != NULL) + fdrop(nfp, td); + fputsock(head); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE (accept1() is MPSAFE) + */ +int +accept(td, uap) + struct thread *td; + struct accept_args *uap; +{ + + return (accept1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE (accept1() is MPSAFE) + */ +int +oaccept(td, uap) + struct thread *td; + struct accept_args *uap; +{ + + return (accept1(td, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +connect(td, uap) + struct thread *td; + register struct connect_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct socket *so; + struct sockaddr *sa; + int error, s; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + error = EALREADY; + goto done1; + } + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + goto done1; + error = soconnect(so, sa, td); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + FREE(sa, M_SONAME); + error = EINPROGRESS; + goto done1; + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0); + if (error) + break; + } + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + FREE(sa, M_SONAME); + if (error == ERESTART) + error = EINTR; +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +socketpair(td, uap) + struct thread *td; + register struct socketpair_args /* { + int domain; + int type; + int protocol; + int *rsv; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + mtx_lock(&Giant); + error = socreate(uap->domain, &so1, uap->type, uap->protocol, + td->td_ucred, td); + if (error) + goto done2; + error = socreate(uap->domain, &so2, uap->type, uap->protocol, + td->td_ucred, td); + if (error) + goto free1; + error = falloc(td, &fp1, &fd); + if (error) + goto free2; + fhold(fp1); + sv[0] = fd; + fp1->f_data = so1; /* so1 already has ref count */ + error = falloc(td, &fp2, &fd); + if (error) + goto free3; + fhold(fp2); + fp2->f_data = so2; /* so2 already has ref count */ + sv[1] = fd; + error = soconnect2(so1, so2); + if (error) + goto free4; + if (uap->type == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. 
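+ * Connecting so1 to so2 does not implicitly connect so2 back to so1,
+ * so the reverse connection has to be established explicitly as well.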
+ */ + error = soconnect2(so2, so1); + if (error) + goto free4; + } + FILE_LOCK(fp1); + fp1->f_flag = FREAD|FWRITE; + fp1->f_ops = &socketops; + fp1->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp1); + FILE_LOCK(fp2); + fp2->f_flag = FREAD|FWRITE; + fp2->f_ops = &socketops; + fp2->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp2); + error = copyout(sv, uap->rsv, 2 * sizeof (int)); + fdrop(fp1, td); + fdrop(fp2, td); + goto done2; +free4: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[sv[1]] == fp2) { + fdp->fd_ofiles[sv[1]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp2, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(fp2, td); +free3: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[sv[0]] == fp1) { + fdp->fd_ofiles[sv[0]] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp1, td); + } else + FILEDESC_UNLOCK(fdp); + fdrop(fp1, td); +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +sendit(td, s, mp, flags) + register struct thread *td; + int s; + register struct msghdr *mp; + int flags; +{ + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *control; + struct sockaddr *to = NULL; + int len, error; + struct socket *so; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; + int iovlen; +#endif + + if ((error = fgetsock(td, s, &so, NULL)) != 0) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) { + error = EINVAL; + goto bad; + } + } + if (mp->msg_name) { + error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); + if (error) + goto bad; + } + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_TRYWAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else { + control = 0; + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_GENIO)) { + iovlen = auio.uio_iovcnt * sizeof (struct iovec); + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy(auio.uio_iov, ktriov, iovlen); + ktruio = auio; + } +#endif + len = auio.uio_resid; + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + flags, td); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* Generation of SIGPIPE can be controlled per socket */ + if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGPIPE); + PROC_UNLOCK(td->td_proc); + } + } + if (error == 0) + td->td_retval[0] = len - auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = td->td_retval[0]; + ktrgenio(s, UIO_WRITE, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif +bad: + fputsock(so); + if (to) + FREE(to, M_SONAME); + return (error); +} + +/* + * MPSAFE + */ +int +sendto(td, 
uap) + struct thread *td; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + mtx_lock(&Giant); + error = sendit(td, uap->s, &msg, uap->flags); + mtx_unlock(&Giant); + return (error); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +osend(td, uap) + struct thread *td; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 0; + mtx_lock(&Giant); + error = sendit(td, uap->s, &msg, uap->flags); + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +osendmsg(td, uap) + struct thread *td; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(td, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif + +/* + * MPSAFE + */ +int +sendmsg(td, uap) + struct thread *td; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (msg)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + if (msg.msg_iovlen && + (error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(td, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +static int +recvit(td, s, mp, namelenp) + register struct thread *td; + int s; + register struct msghdr *mp; + void *namelenp; +{ + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *m, *control = 0; + caddr_t ctlbuf; + struct socket *so; + struct sockaddr *fromsa = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; + struct uio ktruio; + int iovlen; +#endif + + if ((error = fgetsock(td, s, &so, NULL)) != 0) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + 
auio.uio_td = td; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) { + fputsock(so); + return (EINVAL); + } + } +#ifdef KTRACE + if (KTRPOINT(td, KTR_GENIO)) { + iovlen = auio.uio_iovcnt * sizeof (struct iovec); + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy(auio.uio_iov, ktriov, iovlen); + ktruio = auio; + } +#endif + len = auio.uio_resid; + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, + &mp->msg_flags); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) { + ktruio.uio_iov = ktriov; + ktruio.uio_resid = len - auio.uio_resid; + ktrgenio(s, UIO_READ, &ktruio, error); + } + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + td->td_retval[0] = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || fromsa == 0) + len = 0; + else { +#ifndef MIN +#define MIN(a,b) ((a)>(b)?(b):(a)) +#endif + /* save sa_len before it is destroyed by MSG_COMPAT */ + len = MIN(len, fromsa->sa_len); +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + ((struct osockaddr *)fromsa)->sa_family = + fromsa->sa_family; +#endif + error = copyout(fromsa, mp->msg_name, (unsigned)len); + if (error) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout(&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. 
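+ * That is, only SOL_SOCKET/SCM_RIGHTS control data is passed through
+ * to old binaries, and even then the cmsghdr itself is stripped so
+ * they see only the bare descriptor array.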
+ */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + m = control; + mp->msg_controllen = 0; + ctlbuf = mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + if ((error = copyout(mtod(m, caddr_t), + ctlbuf, tocopy)) != 0) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; + } + mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; + } +out: + fputsock(so); + if (fromsa) + FREE(fromsa, M_SONAME); + if (control) + m_freem(control); + return (error); +} + +/* + * MPSAFE + */ +int +recvfrom(td, uap) + struct thread *td; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + mtx_lock(&Giant); + if (uap->fromlenaddr) { + error = copyin(uap->fromlenaddr, + &msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + goto done2; + } else { + msg.msg_namelen = 0; + } + msg.msg_name = uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + error = recvit(td, uap->s, &msg, uap->fromlenaddr); +done2: + mtx_unlock(&Giant); + return(error); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +orecvfrom(td, uap) + struct thread *td; + struct recvfrom_args *uap; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(td, uap)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +orecv(td, uap) + struct thread *td; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + mtx_lock(&Giant); + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + error = recvit(td, uap->s, &msg, NULL); + mtx_unlock(&Giant); + return (error); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. 
+ * + * MPSAFE + */ +int +orecvmsg(td, uap) + struct thread *td; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); + if (error) + return (error); + + mtx_lock(&Giant); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin(msg.msg_iov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); + + if (msg.msg_controllen && error == 0) + error = copyout(&msg.msg_controllen, + &uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} +#endif + +/* + * MPSAFE + */ +int +recvmsg(td, uap) + struct thread *td; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + mtx_lock(&Giant); + error = copyin(uap->msg, &msg, sizeof (msg)); + if (error) + goto done2; + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { + error = EMSGSIZE; + goto done2; + } + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else { + iov = aiov; + } +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin(uiov, iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(td, uap->s, &msg, NULL); + if (!error) { + msg.msg_iov = uiov; + error = copyout(&msg, uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +shutdown(td, uap) + struct thread *td; + register struct shutdown_args /* { + int s; + int how; + } */ *uap; +{ + struct socket *so; + int error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + error = soshutdown(so, uap->how); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +setsockopt(td, uap) + struct thread *td; + register struct setsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int valsize; + } */ *uap; +{ + struct socket *so; + struct sockopt sopt; + int error; + + if (uap->val == 0 && uap->valsize != 0) + return (EFAULT); + if (uap->valsize < 0) + return (EINVAL); + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = uap->valsize; + sopt.sopt_td = td; + error = sosetopt(so, &sopt); + fputsock(so); + } + mtx_unlock(&Giant); + return(error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +getsockopt(td, uap) + struct thread *td; + register struct getsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int *avalsize; + } */ *uap; +{ + int valsize, error; + struct socket *so; + struct sockopt sopt; + + mtx_lock(&Giant); 
+ if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done2; + if (uap->val) { + error = copyin(uap->avalsize, &valsize, sizeof (valsize)); + if (error) + goto done1; + if (valsize < 0) { + error = EINVAL; + goto done1; + } + } else { + valsize = 0; + } + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ + sopt.sopt_td = td; + + error = sogetopt(so, &sopt); + if (error == 0) { + valsize = sopt.sopt_valsize; + error = copyout(&valsize, uap->avalsize, sizeof (valsize)); + } +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * getsockname1() - Get socket name. + * + * MPSAFE + */ +/* ARGSUSED */ +static int +getsockname1(td, uap, compat) + struct thread *td; + register struct getsockname_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct socket *so; + struct sockaddr *sa; + int len, error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) + goto done2; + error = copyin(uap->alen, &len, sizeof (len)); + if (error) + goto done1; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = sa->sa_family; +#endif + error = copyout(sa, uap->asa, (u_int)len); + if (error == 0) +gotnothing: + error = copyout(&len, uap->alen, sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +getsockname(td, uap) + struct thread *td; + struct getsockname_args *uap; +{ + + return (getsockname1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +ogetsockname(td, uap) + struct thread *td; + struct getsockname_args *uap; +{ + + return (getsockname1(td, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * getpeername1() - Get name of peer for connected socket. + * + * MPSAFE + */ +/* ARGSUSED */ +static int +getpeername1(td, uap, compat) + struct thread *td; + register struct getpeername_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct socket *so; + struct sockaddr *sa; + int len, error; + + mtx_lock(&Giant); + if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) + goto done2; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { + error = ENOTCONN; + goto done1; + } + error = copyin(uap->alen, &len, sizeof (len)); + if (error) + goto done1; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, uap->asa, (u_int)len); + if (error) + goto bad; +gotnothing: + error = copyout(&len, uap->alen, sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); +done1: + fputsock(so); +done2: + mtx_unlock(&Giant); + return (error); +} + +/* + * MPSAFE + */ +int +getpeername(td, uap) + struct thread *td; + struct getpeername_args *uap; +{ + + return (getpeername1(td, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +/* + * MPSAFE + */ +int +ogetpeername(td, uap) + struct thread *td; + struct ogetpeername_args *uap; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
*/ + return (getpeername1(td, (struct getpeername_args *)uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + if (type == MT_SONAME && (u_int)buflen <= 112) + buflen = MLEN; /* unix domain compat. hack */ + else +#endif + return (EINVAL); + } + m = m_get(M_TRYWAIT, type); + if (m == NULL) + return (ENOBUFS); + m->m_len = buflen; + error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + if (error) + (void) m_free(m); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); + +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = buflen; + } + } + return (error); +} + +int +getsockaddr(namp, uaddr, len) + struct sockaddr **namp; + caddr_t uaddr; + size_t len; +{ + struct sockaddr *sa; + int error; + + if (len > SOCK_MAXADDRLEN) + return ENAMETOOLONG; + MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); + error = copyin(uaddr, sa, len); + if (error) { + FREE(sa, M_SONAME); + } else { +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = len; + *namp = sa; + } + return error; +} + +/* + * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) + * XXX - The sf_buf functions are currently private to sendfile(2), so have + * been made static, but may be useful in the future for doing zero-copy in + * other parts of the networking code. + */ +static void +sf_buf_init(void *arg) +{ + int i; + + mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); + mtx_lock(&sf_freelist.sf_lock); + SLIST_INIT(&sf_freelist.sf_head); + sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); + sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, + M_NOWAIT | M_ZERO); + for (i = 0; i < nsfbufs; i++) { + sf_bufs[i].kva = sf_base + i * PAGE_SIZE; + SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); + } + sf_buf_alloc_want = 0; + mtx_unlock(&sf_freelist.sf_lock); +} + +/* + * Get an sf_buf from the freelist. Will block if none are available. + */ +struct sf_buf * +sf_buf_alloc() +{ + struct sf_buf *sf; + int error; + + mtx_lock(&sf_freelist.sf_lock); + while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) { + sf_buf_alloc_want++; + error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH, + "sfbufa", 0); + sf_buf_alloc_want--; + + /* + * If we got a signal, don't risk going back to sleep. + */ + if (error) + break; + } + if (sf != NULL) + SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list); + mtx_unlock(&sf_freelist.sf_lock); + return (sf); +} + +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) + +/* + * Detatch mapped page and release resources back to the system. + */ +void +sf_buf_free(void *addr, void *args) +{ + struct sf_buf *sf; + struct vm_page *m; + + GIANT_REQUIRED; + + sf = dtosf(addr); + pmap_qremove((vm_offset_t)addr, 1); + m = sf->m; + vm_page_unwire(m, 0); + /* + * Check for the object going away on us. This can + * happen since we don't hold a reference to it. + * If so, we're responsible for freeing the page. 
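+ * (A page that is no longer wired and has no owning object would
+ * otherwise be leaked.)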
+ */ + if (m->wire_count == 0 && m->object == NULL) + vm_page_free(m); + sf->m = NULL; + mtx_lock(&sf_freelist.sf_lock); + SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list); + if (sf_buf_alloc_want > 0) + wakeup_one(&sf_freelist); + mtx_unlock(&sf_freelist.sf_lock); +} + +/* + * sendfile(2) + * + * MPSAFE + * + * int sendfile(int fd, int s, off_t offset, size_t nbytes, + * struct sf_hdtr *hdtr, off_t *sbytes, int flags) + * + * Send a file specified by 'fd' and starting at 'offset' to a socket + * specified by 's'. Send only 'nbytes' of the file or until EOF if + * nbytes == 0. Optionally add a header and/or trailer to the socket + * output. If specified, write the total number of bytes sent into *sbytes. + * + */ +int +sendfile(struct thread *td, struct sendfile_args *uap) +{ + struct vnode *vp; + struct vm_object *obj; + struct socket *so = NULL; + struct mbuf *m; + struct sf_buf *sf; + struct vm_page *pg; + struct writev_args nuap; + struct sf_hdtr hdtr; + off_t off, xfsize, hdtr_size, sbytes = 0; + int error, s; + + mtx_lock(&Giant); + + hdtr_size = 0; + + /* + * The descriptor must be a regular file and have a backing VM object. + */ + if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) + goto done; + if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { + error = EINVAL; + goto done; + } + if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) + goto done; + if (so->so_type != SOCK_STREAM) { + error = EINVAL; + goto done; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto done; + } + if (uap->offset < 0) { + error = EINVAL; + goto done; + } + + /* + * If specified, get the pointer to the sf_hdtr struct for + * any headers/trailers. + */ + if (uap->hdtr != NULL) { + error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); + if (error) + goto done; + /* + * Send any headers. Wimp out and use writev(2). + */ + if (hdtr.headers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.headers; + nuap.iovcnt = hdtr.hdr_cnt; + error = writev(td, &nuap); + if (error) + goto done; + hdtr_size += td->td_retval[0]; + } + } + + /* + * Protect against multiple writers to the socket. + */ + (void) sblock(&so->so_snd, M_WAITOK); + + /* + * Loop through the pages in the file, starting with the requested + * offset. Get a file page (do I/O if necessary), map the file page + * into an sf_buf, attach an mbuf header to the sf_buf, and queue + * it on the socket. + */ + for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { + vm_pindex_t pindex; + vm_offset_t pgoff; + + pindex = OFF_TO_IDX(off); +retry_lookup: + /* + * Calculate the amount to transfer. Not to exceed a page, + * the EOF, or the passed in nbytes. + */ + xfsize = obj->un_pager.vnp.vnp_size - off; + if (xfsize > PAGE_SIZE) + xfsize = PAGE_SIZE; + pgoff = (vm_offset_t)(off & PAGE_MASK); + if (PAGE_SIZE - pgoff < xfsize) + xfsize = PAGE_SIZE - pgoff; + if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) + xfsize = uap->nbytes - sbytes; + if (xfsize <= 0) + break; + /* + * Optimize the non-blocking case by looking at the socket space + * before going to the extra work of constituting the sf_buf. + */ + if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { + if (so->so_state & SS_CANTSENDMORE) + error = EPIPE; + else + error = EAGAIN; + sbunlock(&so->so_snd); + goto done; + } + /* + * Attempt to look up the page. + * + * Allocate if not found + * + * Wait and loop if busy. 
+ */ + pg = vm_page_lookup(obj, pindex); + + if (pg == NULL) { + pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); + if (pg == NULL) { + VM_WAIT; + goto retry_lookup; + } + vm_page_wakeup(pg); + } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { + goto retry_lookup; + } + + /* + * Wire the page so it does not get ripped out from under + * us. + */ + + vm_page_wire(pg); + + /* + * If page is not valid for what we need, initiate I/O + */ + + if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { + int bsize; + + /* + * Ensure that our page is still around when the I/O + * completes. + */ + vm_page_io_start(pg); + + /* + * Get the page from backing store. + */ + bsize = vp->v_mount->mnt_stat.f_iosize; + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); + error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, + trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | + IO_VMIO | ((MAXBSIZE / bsize) << 16), + td->td_ucred, NULL, td); + VOP_UNLOCK(vp, 0, td); + vm_page_flag_clear(pg, PG_ZERO); + vm_page_io_finish(pg); + if (error) { + vm_page_unwire(pg, 0); + /* + * See if anyone else might know about this page. + * If not and it is not valid, then free it. + */ + if (pg->wire_count == 0 && pg->valid == 0 && + pg->busy == 0 && !(pg->flags & PG_BUSY) && + pg->hold_count == 0) { + vm_page_busy(pg); + vm_page_free(pg); + } + sbunlock(&so->so_snd); + goto done; + } + } + + + /* + * Get a sendfile buf. We usually wait as long as necessary, + * but this wait can be interrupted. + */ + if ((sf = sf_buf_alloc()) == NULL) { + vm_page_unwire(pg, 0); + if (pg->wire_count == 0 && pg->object == NULL) + vm_page_free(pg); + sbunlock(&so->so_snd); + error = EINTR; + goto done; + } + + /* + * Allocate a kernel virtual page and insert the physical page + * into it. + */ + sf->m = pg; + pmap_qenter(sf->kva, &pg, 1); + /* + * Get an mbuf header and set it up as having external storage. + */ + MGETHDR(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + sf_buf_free((void *)sf->kva, NULL); + sbunlock(&so->so_snd); + goto done; + } + /* + * Setup external storage for mbuf. + */ + MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, + EXT_SFBUF); + m->m_data = (char *) sf->kva + pgoff; + m->m_pkthdr.len = m->m_len = xfsize; + /* + * Add the buffer to the socket buffer chain. + */ + s = splnet(); +retry_space: + /* + * Make sure that the socket is still able to take more data. + * CANTSENDMORE being true usually means that the connection + * was closed. so_error is true when an error was sensed after + * a previous send. + * The state is checked after the page mapping and buffer + * allocation above since those operations may block and make + * any socket checks stale. From this point forward, nothing + * blocks before the pru_send (or more accurately, any blocking + * results in a loop back to here to re-check). + */ + if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + } else { + error = so->so_error; + so->so_error = 0; + } + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + /* + * Wait for socket space to become available. We do this just + * after checking the connection state above in order to avoid + * a race condition with sbwait(). + */ + if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { + if (so->so_state & SS_NBIO) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + error = EAGAIN; + goto done; + } + error = sbwait(&so->so_snd); + /* + * An error from sbwait usually indicates that we've + * been interrupted by a signal. 
If we've sent anything + * then return bytes sent, otherwise return the error. + */ + if (error) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + goto retry_space; + } + error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); + splx(s); + if (error) { + sbunlock(&so->so_snd); + goto done; + } + } + sbunlock(&so->so_snd); + + /* + * Send trailers. Wimp out and use writev(2). + */ + if (uap->hdtr != NULL && hdtr.trailers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.trailers; + nuap.iovcnt = hdtr.trl_cnt; + error = writev(td, &nuap); + if (error) + goto done; + hdtr_size += td->td_retval[0]; + } + +done: + /* + * If there was no error we have to clear td->td_retval[0] + * because it may have been set by writev. + */ + if (error == 0) { + td->td_retval[0] = 0; + } + if (uap->sbytes != NULL) { + sbytes += hdtr_size; + copyout(&sbytes, uap->sbytes, sizeof(off_t)); + } + if (vp) + vrele(vp); + if (so) + fputsock(so); + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c new file mode 100644 index 0000000..b227d91 --- /dev/null +++ b/sys/kern/uipc_usrreq.c @@ -0,0 +1,1503 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/resourcevar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/un.h> +#include <sys/unpcb.h> +#include <sys/vnode.h> + +#include <vm/uma.h> + +static uma_zone_t unp_zone; +static unp_gen_t unp_gencnt; +static u_int unp_count; + +static struct unp_head unp_shead, unp_dhead; + +/* + * Unix communications domain. + * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + * lock pushdown + */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach(struct socket *); +static void unp_detach(struct unpcb *); +static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *); +static int unp_connect(struct socket *,struct sockaddr *, struct thread *); +static void unp_disconnect(struct unpcb *); +static void unp_shutdown(struct unpcb *); +static void unp_drop(struct unpcb *, int); +static void unp_gc(void); +static void unp_scan(struct mbuf *, void (*)(struct file *)); +static void unp_mark(struct file *); +static void unp_discard(struct file *); +static void unp_freerights(struct file **, int); +static int unp_internalize(struct mbuf **, struct thread *); +static int unp_listen(struct unpcb *, struct thread *); + +static int +uipc_abort(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_drop(unp, ECONNABORTED); + unp_detach(unp); + sotryfree(so); + return 0; +} + +static int +uipc_accept(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
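+ * Otherwise hand back the unnamed local address (sun_noname).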
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + } else { + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + } + return 0; +} + +static int +uipc_attach(struct socket *so, int proto, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp != 0) + return EISCONN; + return unp_attach(so); +} + +static int +uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + return unp_bind(unp, nam, td); +} + +static int +uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + return unp_connect(so, nam, curthread); +} + +static int +uipc_connect2(struct socket *so1, struct socket *so2) +{ + struct unpcb *unp = sotounpcb(so1); + + if (unp == 0) + return EINVAL; + + return unp_connect2(so1, so2); +} + +/* control is EOPNOTSUPP */ + +static int +uipc_detach(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + unp_detach(unp); + return 0; +} + +static int +uipc_disconnect(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_disconnect(unp); + return 0; +} + +static int +uipc_listen(struct socket *so, struct thread *td) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0 || unp->unp_vnode == 0) + return EINVAL; + return unp_listen(unp, td); +} + +static int +uipc_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_conn && unp->unp_conn->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + return 0; +} + +static int +uipc_rcvd(struct socket *so, int flags) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + u_long newhiwat; + + if (unp == 0) + return EINVAL; + switch (so->so_type) { + case SOCK_DGRAM: + panic("uipc_rcvd DGRAM?"); + /*NOTREACHED*/ + + case SOCK_STREAM: + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. 
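+ * Space freed on the receive side is credited back to the peer's
+ * send-side limits (sb_mbmax and sb_hiwat) so that a blocked writer
+ * can make progress again.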
+ */ + so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; + unp->unp_mbcnt = so->so_rcv.sb_mbcnt; + newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - + so->so_rcv.sb_cc; + (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, + newhiwat, RLIM_INFINITY); + unp->unp_cc = so->so_rcv.sb_cc; + sowwakeup(so2); + break; + + default: + panic("uipc_rcvd unknown socktype"); + } + return 0; +} + +/* pru_rcvoob is EOPNOTSUPP */ + +static int +uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + int error = 0; + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + u_long newhiwat; + + if (unp == 0) { + error = EINVAL; + goto release; + } + if (flags & PRUS_OOB) { + error = EOPNOTSUPP; + goto release; + } + + if (control && (error = unp_internalize(&control, td))) + goto release; + + switch (so->so_type) { + case SOCK_DGRAM: + { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, td); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = (struct sockaddr *)unp->unp_addr; + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, td); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc_send connected but no connection?"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. + */ + if (control) { + if (sbappendcontrol(&so2->so_rcv, m, control)) + control = 0; + } else + sbappend(&so2->so_rcv, m); + so->so_snd.sb_mbmax -= + so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt; + newhiwat = so->so_snd.sb_hiwat - + (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc); + (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, + newhiwat, RLIM_INFINITY); + unp->unp_conn->unp_cc = so2->so_rcv.sb_cc; + sorwakeup(so2); + m = 0; + break; + + default: + panic("uipc_send unknown socktype"); + } + + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
+ */ + if (flags & PRUS_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } + + if (control && error != 0) + unp_dispose(control); + +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return error; +} + +static int +uipc_sense(struct socket *so, struct stat *sb) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + sb->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + sb->st_blksize += so2->so_rcv.sb_cc; + } + sb->st_dev = NOUDEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + sb->st_ino = unp->unp_ino; + return (0); +} + +static int +uipc_shutdown(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + socantsendmore(so); + unp_shutdown(unp); + return 0; +} + +static int +uipc_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1); + else + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + return 0; +} + +struct pr_usrreqs uipc_usrreqs = { + uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect, + uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, + uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, + uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, + sosend, soreceive, sopoll +}; + +int +uipc_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct unpcb *unp = sotounpcb(so); + int error; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case LOCAL_PEERCRED: + if (unp->unp_flags & UNP_HAVEPC) + error = sooptcopyout(sopt, &unp->unp_peercred, + sizeof(unp->unp_peercred)); + else { + if (so->so_type == SOCK_STREAM) + error = ENOTCONN; + else + error = EINVAL; + } + break; + default: + error = EOPNOTSUPP; + break; + } + break; + case SOPT_SET: + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. 
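+ * The defaults below give stream sockets PIPSIZ (8k) of buffering in
+ * each direction, limit datagrams to 2k, and reserve 4k of receive
+ * space for them.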
+ */ +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_DECL(_net_local_stream); +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_DECL(_net_local_dgram); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_DECL(_net_local); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int +unp_attach(so) + struct socket *so; +{ + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + unp = uma_zalloc(unp_zone, M_WAITOK); + if (unp == NULL) + return (ENOBUFS); + bzero(unp, sizeof *unp); + unp->unp_gencnt = ++unp_gencnt; + unp_count++; + LIST_INIT(&unp->unp_refs); + unp->unp_socket = so; + FILEDESC_LOCK(curproc->p_fd); + unp->unp_rvnode = curthread->td_proc->p_fd->fd_rdir; + FILEDESC_UNLOCK(curproc->p_fd); + LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead + : &unp_shead, unp, unp_link); + so->so_pcb = unp; + return (0); +} + +static void +unp_detach(unp) + register struct unpcb *unp; +{ + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + --unp_count; + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (!LIST_EMPTY(&unp->unp_refs)) + unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). 
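+ * Flushing the receive buffer here, before calling unp_gc(), avoids
+ * that ordering problem.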
+ */ + sorflush(unp->unp_socket); + unp_gc(); + } + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + uma_zfree(unp_zone, unp); +} + +static int +unp_bind(unp, nam, td) + struct unpcb *unp; + struct sockaddr *nam; + struct thread *td; +{ + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error, namelen; + struct nameidata nd; + char *buf; + + if (unp->unp_vnode != NULL) + return (EINVAL); + namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); + if (namelen <= 0) + return EINVAL; + buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK); + strncpy(buf, soun->sun_path, namelen); + buf[namelen] = 0; /* null-terminate the string */ +restart: + NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, + buf, td); +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + error = namei(&nd); + if (error) { + free(buf, M_TEMP); + return (error); + } + vp = nd.ni_vp; + if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULL) { + vrele(vp); + free(buf, M_TEMP); + return (EADDRINUSE); + } + error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); + if (error) { + free(buf, M_TEMP); + return (error); + } + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (error) { + free(buf, M_TEMP); + return (error); + } + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + free(buf, M_TEMP); + return (0); +} + +static int +unp_connect(so, nam, td) + struct socket *so; + struct sockaddr *nam; + struct thread *td; +{ + register struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp, *unp2, *unp3; + int error, len; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); + if (len <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, len); + buf[len] = 0; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); + if (error) + goto bad; + so2 = vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp = sotounpcb(so); + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = (struct sockaddr_un *) + dup_sockaddr((struct sockaddr *) + unp2->unp_addr, 1); + + /* + * unp_peercred management: + * + * The connecter's (client's) credentials are copied + * from its process structure at the time of connect() + * (which is now). 
+ */ + cru2x(td->td_ucred, &unp3->unp_peercred); + unp3->unp_flags |= UNP_HAVEPC; + /* + * The receiver's (server's) credentials are copied + * from the unp_peercred member of socket on which the + * former called listen(); unp_listen() cached that + * process's credentials at that time so we can use + * them now. + */ + KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, + ("unp_connect: listener without cached peercred")); + memcpy(&unp->unp_peercred, &unp2->unp_peercred, + sizeof(unp->unp_peercred)); + unp->unp_flags |= UNP_HAVEPC; + + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +int +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +static void +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + LIST_REMOVE(unp, unp_reflink); + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +void +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +static int +unp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n; + struct unpcb *unp, **unp_list; + unp_gen_t gencnt; + struct xunpgen *xug; + struct unp_head *head; + struct xunpcb *xu; + + head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = unp_count; + req->oldidx = 2 * (sizeof *xug) + + (n + n/8) * sizeof(struct xunpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); + gencnt = unp_gencnt; + n = unp_count; + + xug->xug_len = sizeof *xug; + xug->xug_count = n; + xug->xug_gen = gencnt; + xug->xug_sogen = so_gencnt; + error = SYSCTL_OUT(req, xug, sizeof *xug); + if (error) { + free(xug, M_TEMP); + return error; + } + + unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); + + for (unp = LIST_FIRST(head), i = 0; unp && i < n; + unp = LIST_NEXT(unp, unp_link)) { + if (unp->unp_gencnt <= gencnt) { + if (cr_cansee(req->td->td_ucred, + unp->unp_socket->so_cred)) + continue; + unp_list[i++] = unp; + } + } + n = i; /* in case we lost some during malloc */ + + error = 0; + xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK); + for (i = 0; i < n; i++) { + unp = unp_list[i]; + if (unp->unp_gencnt <= gencnt) { + xu->xu_len = sizeof *xu; + xu->xu_unpp = unp; + /* + * XXX - need more locking here to protect against + * connect/disconnect races for SMP. 
+ */ + if (unp->unp_addr) + bcopy(unp->unp_addr, &xu->xu_addr, + unp->unp_addr->sun_len); + if (unp->unp_conn && unp->unp_conn->unp_addr) + bcopy(unp->unp_conn->unp_addr, + &xu->xu_caddr, + unp->unp_conn->unp_addr->sun_len); + bcopy(unp, &xu->xu_unp, sizeof *unp); + sotoxsocket(unp->unp_socket, &xu->xu_socket); + error = SYSCTL_OUT(req, xu, sizeof *xu); + } + } + free(xu, M_TEMP); + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + xug->xug_gen = unp_gencnt; + xug->xug_sogen = so_gencnt; + xug->xug_count = unp_count; + error = SYSCTL_OUT(req, xug, sizeof *xug); + } + free(unp_list, M_TEMP); + free(xug, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", + "List of active local datagram sockets"); +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", + "List of active local stream sockets"); + +static void +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +static void +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); +} + +#ifdef notdef +void +unp_drain() +{ + +} +#endif + +static void +unp_freerights(rp, fdcount) + struct file **rp; + int fdcount; +{ + int i; + struct file *fp; + + for (i = 0; i < fdcount; i++) { + fp = *rp; + /* + * zero the pointer before calling + * unp_discard since it may end up + * in unp_gc().. + */ + *rp++ = 0; + unp_discard(fp); + } +} + +int +unp_externalize(control, controlp) + struct mbuf *control, **controlp; +{ + struct thread *td = curthread; /* XXX */ + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + int i; + int *fdp; + struct file **rp; + struct file *fp; + void *data; + socklen_t clen = control->m_len, datalen; + int error, newfds; + int f; + u_int newlen; + + error = 0; + if (controlp != NULL) /* controlp == NULL => free control messages */ + *controlp = NULL; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_len > clen) { + error = EINVAL; + break; + } + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; + + if (cm->cmsg_level == SOL_SOCKET + && cm->cmsg_type == SCM_RIGHTS) { + newfds = datalen / sizeof(struct file *); + rp = data; + + /* If we're not outputting the discriptors free them. */ + if (error || controlp == NULL) { + unp_freerights(rp, newfds); + goto next; + } + FILEDESC_LOCK(td->td_proc->p_fd); + /* if the new FD's will not fit free them. */ + if (!fdavail(td, newfds)) { + FILEDESC_UNLOCK(td->td_proc->p_fd); + error = EMSGSIZE; + unp_freerights(rp, newfds); + goto next; + } + /* + * now change each pointer to an fd in the global + * table to an integer that is the index to the + * local fd table entry that we set up to point + * to the global one we are transferring. 
+ */ + newlen = newfds * sizeof(int); + *controlp = sbcreatecontrol(NULL, newlen, + SCM_RIGHTS, SOL_SOCKET); + if (*controlp == NULL) { + FILEDESC_UNLOCK(td->td_proc->p_fd); + error = E2BIG; + unp_freerights(rp, newfds); + goto next; + } + + fdp = (int *) + CMSG_DATA(mtod(*controlp, struct cmsghdr *)); + for (i = 0; i < newfds; i++) { + if (fdalloc(td, 0, &f)) + panic("unp_externalize fdalloc failed"); + fp = *rp++; + td->td_proc->p_fd->fd_ofiles[f] = fp; + FILE_LOCK(fp); + fp->f_msgcount--; + FILE_UNLOCK(fp); + unp_rights--; + *fdp++ = f; + } + FILEDESC_UNLOCK(td->td_proc->p_fd); + } else { /* We can just copy anything else across */ + if (error || controlp == NULL) + goto next; + *controlp = sbcreatecontrol(NULL, datalen, + cm->cmsg_type, cm->cmsg_level); + if (*controlp == NULL) { + error = ENOBUFS; + goto next; + } + bcopy(data, + CMSG_DATA(mtod(*controlp, struct cmsghdr *)), + datalen); + } + + controlp = &(*controlp)->m_next; + +next: + if (CMSG_SPACE(datalen) < clen) { + clen -= CMSG_SPACE(datalen); + cm = (struct cmsghdr *) + ((caddr_t)cm + CMSG_SPACE(datalen)); + } else { + clen = 0; + cm = NULL; + } + } + + m_freem(control); + + return (error); +} + +void +unp_init(void) +{ + unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(unp_zone, nmbclusters); + if (unp_zone == 0) + panic("unp_init"); + LIST_INIT(&unp_dhead); + LIST_INIT(&unp_shead); +} + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int +unp_internalize(controlp, td) + struct mbuf **controlp; + struct thread *td; +{ + struct mbuf *control = *controlp; + struct proc *p = td->td_proc; + struct filedesc *fdescp = p->p_fd; + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + struct cmsgcred *cmcred; + struct file **rp; + struct file *fp; + struct timeval *tv; + int i, fd, *fdp; + void *data; + socklen_t clen = control->m_len, datalen; + int error, oldfds; + u_int newlen; + + error = 0; + *controlp = NULL; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET + || cm->cmsg_len > clen) { + error = EINVAL; + goto out; + } + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; + + switch (cm->cmsg_type) { + /* + * Fill in credential information. + */ + case SCM_CREDS: + *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), + SCM_CREDS, SOL_SOCKET); + if (*controlp == NULL) { + error = ENOBUFS; + goto out; + } + + cmcred = (struct cmsgcred *) + CMSG_DATA(mtod(*controlp, struct cmsghdr *)); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = td->td_ucred->cr_ruid; + cmcred->cmcred_gid = td->td_ucred->cr_rgid; + cmcred->cmcred_euid = td->td_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = + td->td_ucred->cr_groups[i]; + break; + + case SCM_RIGHTS: + oldfds = datalen / sizeof (int); + /* + * check that all the FDs passed in refer to legal files + * If not, reject the entire operation. + */ + fdp = data; + FILEDESC_LOCK(fdescp); + for (i = 0; i < oldfds; i++) { + fd = *fdp++; + if ((unsigned)fd >= fdescp->fd_nfiles || + fdescp->fd_ofiles[fd] == NULL) { + FILEDESC_UNLOCK(fdescp); + error = EBADF; + goto out; + } + } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. 
+ */
+ newlen = oldfds * sizeof(struct file *);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_UNLOCK(fdescp);
+ error = E2BIG;
+ goto out;
+ }
+
+ fdp = data;
+ rp = (struct file **)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ for (i = 0; i < oldfds; i++) {
+ fp = fdescp->fd_ofiles[*fdp++];
+ *rp++ = fp;
+ FILE_LOCK(fp);
+ fp->f_count++;
+ fp->f_msgcount++;
+ FILE_UNLOCK(fp);
+ unp_rights++;
+ }
+ FILEDESC_UNLOCK(fdescp);
+ break;
+
+ case SCM_TIMESTAMP:
+ *controlp = sbcreatecontrol(NULL, sizeof(*tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ tv = (struct timeval *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ microtime(tv);
+ break;
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ controlp = &(*controlp)->m_next;
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+out:
+ m_freem(control);
+
+ return (error);
+}
+
+static int unp_defer, unp_gcing;
+
+static void
+unp_gc()
+{
+ register struct file *fp, *nextfp;
+ register struct socket *so;
+ struct file **extra_ref, **fpp;
+ int nunref, i;
+
+ if (unp_gcing)
+ return;
+ unp_gcing = 1;
+ unp_defer = 0;
+ /*
+ * Before going through all this, set all FDs to
+ * be NOT deferred and NOT externally accessible.
+ */
+ sx_slock(&filelist_lock);
+ LIST_FOREACH(fp, &filehead, f_list)
+ fp->f_gcflag &= ~(FMARK|FDEFER);
+ do {
+ LIST_FOREACH(fp, &filehead, f_list) {
+ FILE_LOCK(fp);
+ /*
+ * If the file is not open, skip it.
+ */
+ if (fp->f_count == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If we already marked it as 'defer' in a
+ * previous pass, then try to process it this time
+ * and un-mark it.
+ */
+ if (fp->f_gcflag & FDEFER) {
+ fp->f_gcflag &= ~FDEFER;
+ unp_defer--;
+ } else {
+ /*
+ * If it's not deferred, then check if it's
+ * already marked; if so, skip it.
+ */
+ if (fp->f_gcflag & FMARK) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If all references are from messages
+ * in transit, then skip it. It's not
+ * externally accessible.
+ */
+ if (fp->f_count == fp->f_msgcount) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ /*
+ * If it got this far then it must be
+ * externally accessible.
+ */
+ fp->f_gcflag |= FMARK;
+ }
+ /*
+ * Either it was deferred, or it is externally
+ * accessible and not already marked so.
+ * Now check if it is possibly one of OUR sockets.
+ */
+ if (fp->f_type != DTYPE_SOCKET ||
+ (so = (struct socket *)fp->f_data) == 0) {
+ FILE_UNLOCK(fp);
+ continue;
+ }
+ FILE_UNLOCK(fp);
+ if (so->so_proto->pr_domain != &localdomain ||
+ (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+ continue;
+#ifdef notdef
+ if (so->so_rcv.sb_flags & SB_LOCK) {
+ /*
+ * This is problematical; it's not clear
+ * we need to wait for the sockbuf to be
+ * unlocked (on a uniprocessor, at least),
+ * and it's also not clear what to do
+ * if sbwait returns an error due to receipt
+ * of a signal. If sbwait does return
+ * an error, we'll go into an infinite
+ * loop. Delete all of this for now.
+ */
+ (void) sbwait(&so->so_rcv);
+ goto restart;
+ }
+#endif
+ /*
+ * So, OK, it's one of our sockets and it IS externally
+ * accessible (or was deferred). Now we look
+ * to see if we hold any file descriptors in its
+ * message buffers. Follow those links and mark them
+ * as accessible too.
+ */
+ unp_scan(so->so_rcv.sb_mb, unp_mark);
+ }
+ } while (unp_defer);
+ sx_sunlock(&filelist_lock);
+ /*
+ * We grab an extra reference to each of the file table entries
+ * that are not otherwise accessible and then free the rights
+ * that are stored in messages on them.
+ *
+ * The bug in the original code is a little tricky, so I'll describe
+ * what's wrong with it here.
+ *
+ * It is incorrect to simply unp_discard each entry for f_msgcount
+ * times -- consider the case of sockets A and B that contain
+ * references to each other. On a last close of some other socket,
+ * we trigger a gc since the number of outstanding rights (unp_rights)
+ * is non-zero. If during the sweep phase the gc code unp_discards,
+ * we end up doing a (full) closef on the descriptor. A closef on A
+ * results in the following chain. Closef calls soo_close, which
+ * calls soclose. Soclose calls first (through the switch
+ * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
+ * returns because the previous instance had set unp_gcing, and
+ * we return all the way back to soclose, which marks the socket
+ * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
+ * to free up the rights that are queued in messages on socket A,
+ * i.e., the reference on B. The sorflush calls via the dom_dispose
+ * switch unp_dispose, which unp_scans with unp_discard. This second
+ * instance of unp_discard just calls closef on B.
+ *
+ * Well, a similar chain occurs on B, resulting in a sorflush on B,
+ * which results in another closef on A. Unfortunately, A is already
+ * being closed, and the descriptor has already been marked with
+ * SS_NOFDREF, and soclose panics at this point.
+ *
+ * Here, we first take an extra reference to each inaccessible
+ * descriptor. Then, we call sorflush ourselves, since we know
+ * it is a Unix domain socket anyhow. After we destroy all the
+ * rights carried in messages, we do a last closef to get rid
+ * of our extra reference. This is the last close, and the
+ * unp_detach etc. will shut down the socket.
+ * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_TEMP, M_WAITOK); + sx_slock(&filelist_lock); + for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; + fp = nextfp) { + nextfp = LIST_NEXT(fp, f_list); + FILE_LOCK(fp); + /* + * If it's not open, skip it + */ + if (fp->f_count == 0) { + FILE_UNLOCK(fp); + continue; + } + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ + if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + FILE_UNLOCK(fp); + } + sx_sunlock(&filelist_lock); + /* + * for each FD on our hit list, do the following two things + */ + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + FILE_LOCK(tfp); + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { + FILE_UNLOCK(tfp); + sorflush((struct socket *)(tfp->f_data)); + } else + FILE_UNLOCK(tfp); + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp, (struct thread *) NULL); + free(extra_ref, M_TEMP); + unp_gcing = 0; +} + +void +unp_dispose(m) + struct mbuf *m; +{ + + if (m) + unp_scan(m, unp_discard); +} + +static int +unp_listen(unp, td) + struct unpcb *unp; + struct thread *td; +{ + + cru2x(td->td_ucred, &unp->unp_peercred); + unp->unp_flags |= UNP_HAVEPCCACHED; + return (0); +} + +static void +unp_scan(m0, op) + register struct mbuf *m0; + void (*op)(struct file *); +{ + struct mbuf *m; + struct file **rp; + struct cmsghdr *cm; + void *data; + int i; + socklen_t clen, datalen; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) { + if (m->m_type != MT_CONTROL) + continue; + + cm = mtod(m, struct cmsghdr *); + clen = m->m_len; + + while (cm != NULL) { + if (sizeof(*cm) > clen || cm->cmsg_len > clen) + break; + + data = CMSG_DATA(cm); + datalen = (caddr_t)cm + cm->cmsg_len + - (caddr_t)data; + + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_RIGHTS) { + qfds = datalen / sizeof (struct file *); + rp = data; + for (i = 0; i < qfds; i++) + (*op)(*rp++); + } + + if (CMSG_SPACE(datalen) < clen) { + clen -= CMSG_SPACE(datalen); + cm = (struct cmsghdr *) + ((caddr_t)cm + CMSG_SPACE(datalen)); + } else { + clen = 0; + cm = NULL; + } + } + } + m0 = m0->m_act; + } +} + +static void +unp_mark(fp) + struct file *fp; +{ + if (fp->f_gcflag & FMARK) + return; + unp_defer++; + fp->f_gcflag |= (FMARK|FDEFER); +} + +static void +unp_discard(fp) + struct file *fp; +{ + FILE_LOCK(fp); + fp->f_msgcount--; + unp_rights--; + FILE_UNLOCK(fp); + (void) closef(fp, (struct thread *)NULL); +} diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c new file mode 100644 index 0000000..70be0ec --- /dev/null +++ b/sys/kern/vfs_acl.c @@ -0,0 +1,830 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/acl.h> + +MALLOC_DEFINE(M_ACL, "acl", "access control list"); + +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); + +/* + * Implement a version of vaccess() that understands POSIX.1e ACL semantics. + * Return 0 on success, else an errno value. Should be merged into + * vaccess() eventually. + */ +int +vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, + struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) +{ + struct acl_entry *acl_other, *acl_mask; + mode_t dac_granted; + mode_t cap_granted; + mode_t acl_mask_granted; + int group_matched, i; + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. Otherwise, attempt + * to use privileges granted via cap_granted. In some cases, + * which privileges to use may be ambiguous due to "best match", + * in which case fall back on first match for the time being. + */ + if (privused != NULL) + *privused = 0; + + /* + * Determine privileges now, but don't apply until we've found + * a DAC entry that matches but has failed to allow access. + */ +#ifndef CAPABILITIES + if (suser_cred(cred, PRISON_ROOT) == 0) + cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); + else + cap_granted = 0; +#else + cap_granted = 0; + + if (type == VDIR) { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && !cap_check(cred, NULL, + CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, + PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, + PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, + PRISON_ROOT)) + cap_granted |= VADMIN; +#endif /* CAPABILITIES */ + + /* + * The owner matches if the effective uid associated with the + * credential matches that of the ACL_USER_OBJ entry. 
While we're + * doing the first scan, also cache the location of the ACL_MASK + * and ACL_OTHER entries, preventing some future iterations. + */ + acl_mask = acl_other = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + if (file_uid != cred->cr_uid) + break; + dac_granted = 0; + dac_granted |= VADMIN; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == + acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + goto error; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + break; + + case ACL_OTHER: + acl_other = &acl->acl_entry[i]; + break; + + default: + break; + } + } + + /* + * An ACL_OTHER entry should always exist in a valid access + * ACL. If it doesn't, then generate a serious failure. For now, + * this means a debugging message and EPERM, but in the future + * should probably be a panic. + */ + if (acl_other == NULL) { + /* + * XXX This should never happen + */ + printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); + return (EPERM); + } + + /* + * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields + * are masked by an ACL_MASK entry, if any. As such, first identify + * the ACL_MASK field, then iterate through identifying potential + * user matches, then group matches. If there is no ACL_MASK, + * assume that the mask allows all requests to succeed. + */ + if (acl_mask != NULL) { + acl_mask_granted = 0; + if (acl_mask->ae_perm & ACL_EXECUTE) + acl_mask_granted |= VEXEC; + if (acl_mask->ae_perm & ACL_READ) + acl_mask_granted |= VREAD; + if (acl_mask->ae_perm & ACL_WRITE) + acl_mask_granted |= VWRITE; + } else + acl_mask_granted = VEXEC | VREAD | VWRITE; + + /* + * Iterate through user ACL entries. Do checks twice, first + * without privilege, and then if a match is found but failed, + * a second time with privilege. + */ + + /* + * Check ACL_USER ACL entries. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER: + if (acl->acl_entry[i].ae_id != cred->cr_uid) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + goto error; + + if (privused != NULL) + *privused = 1; + return (0); + } + } + + /* + * Group match is best-match, not first-match, so find a + * "best" match. Iterate across, testing each potential group + * match. Make sure we keep track of whether we found a match + * or not, so that we know if we should try again with any + * available privilege, or if we should move on to ACL_OTHER. 
+ */ + group_matched = 0; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + group_matched = 1; + break; + + default: + break; + } + } + + if (group_matched == 1) { + /* + * There was a match, but it did not grant rights via + * pure DAC. Try again, this time with privilege. + */ + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_GROUP_OBJ: + if (!groupmember(file_gid, cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + case ACL_GROUP: + if (!groupmember(acl->acl_entry[i].ae_id, + cred)) + break; + dac_granted = 0; + if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl->acl_entry[i].ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl->acl_entry[i].ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + dac_granted &= acl_mask_granted; + + if ((acc_mode & (dac_granted | cap_granted)) != + acc_mode) + break; + + if (privused != NULL) + *privused = 1; + return (0); + + default: + break; + } + } + /* + * Even with privilege, group membership was not sufficient. + * Return failure. + */ + goto error; + } + + /* + * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. + */ + dac_granted = 0; + if (acl_other->ae_perm & ACL_EXECUTE) + dac_granted |= VEXEC; + if (acl_other->ae_perm & ACL_READ) + dac_granted |= VREAD; + if (acl_other->ae_perm & ACL_WRITE) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { + if (privused != NULL) + *privused = 1; + return (0); + } + +error: + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} + +/* + * For the purposes of filesystems maintaining the _OBJ entries in an + * inode with a mode_t field, this routine converts a mode_t entry + * to an acl_perm_t. 
+ */ +acl_perm_t +acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) +{ + acl_perm_t perm = 0; + + switch(tag) { + case ACL_USER_OBJ: + if (mode & S_IXUSR) + perm |= ACL_EXECUTE; + if (mode & S_IRUSR) + perm |= ACL_READ; + if (mode & S_IWUSR) + perm |= ACL_WRITE; + return (perm); + + case ACL_GROUP_OBJ: + if (mode & S_IXGRP) + perm |= ACL_EXECUTE; + if (mode & S_IRGRP) + perm |= ACL_READ; + if (mode & S_IWGRP) + perm |= ACL_WRITE; + return (perm); + + case ACL_OTHER: + if (mode & S_IXOTH) + perm |= ACL_EXECUTE; + if (mode & S_IROTH) + perm |= ACL_READ; + if (mode & S_IWOTH) + perm |= ACL_WRITE; + return (perm); + + default: + printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); + return (0); + } +} + +/* + * Given inode information (uid, gid, mode), return an acl entry of the + * appropriate type. + */ +struct acl_entry +acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) +{ + struct acl_entry acl_entry; + + acl_entry.ae_tag = tag; + acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); + switch(tag) { + case ACL_USER_OBJ: + acl_entry.ae_id = uid; + break; + + case ACL_GROUP_OBJ: + acl_entry.ae_id = gid; + break; + + case ACL_OTHER: + acl_entry.ae_id = ACL_UNDEFINED_ID; + break; + + default: + acl_entry.ae_id = ACL_UNDEFINED_ID; + printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); + } + + return (acl_entry); +} + +/* + * Utility function to generate a file mode given appropriate ACL entries. + */ +mode_t +acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, + struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) +{ + mode_t mode; + + mode = 0; + if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXUSR; + if (acl_user_obj_entry->ae_perm & ACL_READ) + mode |= S_IRUSR; + if (acl_user_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWUSR; + if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXGRP; + if (acl_group_obj_entry->ae_perm & ACL_READ) + mode |= S_IRGRP; + if (acl_group_obj_entry->ae_perm & ACL_WRITE) + mode |= S_IWGRP; + if (acl_other_entry->ae_perm & ACL_EXECUTE) + mode |= S_IXOTH; + if (acl_other_entry->ae_perm & ACL_READ) + mode |= S_IROTH; + if (acl_other_entry->ae_perm & ACL_WRITE) + mode |= S_IWOTH; + + return (mode); +} + +/* + * Perform a syntactic check of the ACL, sufficient to allow an + * implementing filesystem to determine if it should accept this and + * rely on the POSIX.1e ACL properties. + */ +int +acl_posix1e_check(struct acl *acl) +{ + int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; + int num_acl_mask, num_acl_other, i; + + /* + * Verify that the number of entries does not exceed the maximum + * defined for acl_t. + * Verify that the correct number of various sorts of ae_tags are + * present: + * Exactly one ACL_USER_OBJ + * Exactly one ACL_GROUP_OBJ + * Exactly one ACL_OTHER + * If any ACL_USER or ACL_GROUP entries appear, then exactly one + * ACL_MASK entry must also appear. + * Verify that all ae_perm entries are in ACL_PERM_BITS. + * Verify all ae_tag entries are understood by this implementation. + * Note: Does not check for uniqueness of qualifier (ae_id) field. + */ + num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = + num_acl_mask = num_acl_other = 0; + if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) + return (EINVAL); + for (i = 0; i < acl->acl_cnt; i++) { + /* + * Check for a valid tag. 
+ */ + switch(acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user_obj++; + break; + case ACL_GROUP_OBJ: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group_obj++; + break; + case ACL_USER: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_user++; + break; + case ACL_GROUP: + if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_group++; + break; + case ACL_OTHER: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_other++; + break; + case ACL_MASK: + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ + if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) + return (EINVAL); + num_acl_mask++; + break; + default: + return (EINVAL); + } + /* + * Check for valid perm entries. + */ + if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != + ACL_PERM_BITS) + return (EINVAL); + } + if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || + (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) + return (EINVAL); + if (((num_acl_group != 0) || (num_acl_user != 0)) && + (num_acl_mask != 1)) + return (EINVAL); + return (0); +} + +/* + * These calls wrap the real vnode operations, and are called by the + * syscall code once the syscall has converted the path or file + * descriptor to a vnode (unlocked). The aclp pointer is assumed + * still to point to userland, so this should not be consumed within + * the kernel except by syscall code. Other code should directly + * invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernacl; + struct mount *mp; + int error; + + error = copyin(aclp, &inkernacl, sizeof(struct acl)); + if (error) + return(error); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return(error); +} + +/* + * Given a vnode, get its ACL. + */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error == 0) + error = copyout(&inkernelacl, aclp, sizeof(struct acl)); + return (error); +} + +/* + * Given a vnode, delete its ACL. 
+ */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETACL(vp, type, NULL, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl inkernelacl; + int error; + + error = copyin(aclp, &inkernelacl, sizeof(struct acl)); + if (error) + return(error); + error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. + * Don't need to lock, as the vacl_ code will get/release any locks + * required. + */ + +/* + * Given a file path, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, get an ACL for it + * + * MPSAFE + */ +int +__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_get_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, set an ACL for it + * + * MPSAFE + */ +int +__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_set_acl(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. + * + * MPSAFE + */ +int +__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, delete an ACL from it. 
+ * + * MPSAFE + */ +int +__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_delete(td, (struct vnode *)fp->f_data, + SCARG(uap, type)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file path, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + struct nameidata nd; + int error; + + mtx_lock(&Giant); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), + SCARG(uap, aclp)); + NDFREE(&nd, 0); + } + mtx_unlock(&Giant); + return (error); +} + +/* + * Given a file descriptor, check an ACL for it + * + * MPSAFE + */ +int +__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + int error; + + mtx_lock(&Giant); + error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); + if (error == 0) { + error = vacl_aclcheck(td, (struct vnode *)fp->f_data, + SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); + } + mtx_unlock(&Giant); + return (error); +} diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c new file mode 100644 index 0000000..891f272 --- /dev/null +++ b/sys/kern/vfs_aio.c @@ -0,0 +1,2307 @@ +/* + * Copyright (c) 1997 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + * + * $FreeBSD$ + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/unistd.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/protosw.h> +#include <sys/socketvar.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/sx.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <sys/event.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/uma.h> +#include <sys/aio.h> + +#include <machine/limits.h> + +#include "opt_vfs_aio.h" + +/* + * Counter for allocating reference ids to new jobs. Wrapped to 1 on + * overflow. 
+ */ +static long jobrefid; + +#define JOBST_NULL 0x0 +#define JOBST_JOBQGLOBAL 0x2 +#define JOBST_JOBRUNNING 0x3 +#define JOBST_JOBFINISHED 0x4 +#define JOBST_JOBQBUF 0x5 +#define JOBST_JOBBFINISHED 0x6 + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef MAX_AIO_PROCS +#define MAX_AIO_PROCS 32 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef TARGET_AIO_PROCS +#define TARGET_AIO_PROCS 4 +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +#ifndef AIOD_TIMEOUT_DEFAULT +#define AIOD_TIMEOUT_DEFAULT (10 * hz) +#endif + +#ifndef AIOD_LIFETIME_DEFAULT +#define AIOD_LIFETIME_DEFAULT (30 * hz) +#endif + +SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); + +static int max_aio_procs = MAX_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, + CTLFLAG_RW, &max_aio_procs, 0, + "Maximum number of kernel threads to use for handling async IO "); + +static int num_aio_procs = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, + CTLFLAG_RD, &num_aio_procs, 0, + "Number of presently active kernel threads for async IO"); + +/* + * The code will adjust the actual number of AIO processes towards this + * number when it gets a chance. + */ +static int target_aio_procs = TARGET_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, + 0, "Preferred number of ready kernel threads for async IO"); + +static int max_queue_count = MAX_AIO_QUEUE; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, + "Maximum number of aio requests to queue, globally"); + +static int num_queue_count = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, + "Number of queued aio requests"); + +static int num_buf_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, + "Number of aio requests presently handled by the buf subsystem"); + +/* Number of async I/O thread in the process of being started */ +/* XXX This should be local to _aio_aqueue() */ +static int num_aio_resv_start = 0; + +static int aiod_timeout; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, + "Timeout value for synchronous aio operations"); + +static int aiod_lifetime; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, + "Maximum lifetime for idle aiod"); + +static int unloadable = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, + "Allow unload of aio (not recommended)"); + + +static int max_aio_per_proc = MAX_AIO_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, + 0, "Maximum active aio requests per process (stored in the process)"); + +static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, + &max_aio_queue_per_proc, 0, + "Maximum queued aio requests per process (stored in the process)"); + +static int max_buf_aio = MAX_BUF_AIO; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, + "Maximum buf aio requests per process (stored in the process)"); + +struct aiocblist { + TAILQ_ENTRY(aiocblist) list; /* List of jobs */ + TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */ + int jobflags; + int jobstate; + int inputcharge; + int outputcharge; + struct callout_handle timeouthandle; + struct buf *bp; 
/* Buffer pointer */ + struct proc *userproc; /* User process */ /* Not td! */ + struct file *fd_file; /* Pointer to file structure */ + struct aio_liojob *lio; /* Optional lio job */ + struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */ + struct klist klist; /* list of knotes */ + struct aiocb uaiocb; /* Kernel I/O control block */ +}; + +/* jobflags */ +#define AIOCBLIST_RUNDOWN 0x4 +#define AIOCBLIST_ASYNCFREE 0x8 +#define AIOCBLIST_DONE 0x10 + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ +#define AIOP_SCHED 0x2 /* proc explicitly scheduled */ + +struct aiothreadlist { + int aiothreadflags; /* AIO proc flags */ + TAILQ_ENTRY(aiothreadlist) list; /* List of processes */ + struct thread *aiothread; /* The AIO thread */ +}; + +/* + * data-structure for lio signal management + */ +struct aio_liojob { + int lioj_flags; + int lioj_buffer_count; + int lioj_buffer_finished_count; + int lioj_queue_count; + int lioj_queue_finished_count; + struct sigevent lioj_signal; /* signal on all I/O done */ + TAILQ_ENTRY(aio_liojob) lioj_list; + struct kaioinfo *lioj_ki; +}; +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ + +/* + * per process aio data structure + */ +struct kaioinfo { + int kaio_flags; /* per process kaio flags */ + int kaio_maxactive_count; /* maximum number of AIOs */ + int kaio_active_count; /* number of currently used AIOs */ + int kaio_qallowed_count; /* maxiumu size of AIO queue */ + int kaio_queue_count; /* size of AIO queue */ + int kaio_ballowed_count; /* maximum number of buffers */ + int kaio_queue_finished_count; /* number of daemon jobs finished */ + int kaio_buffer_count; /* number of physio buffers */ + int kaio_buffer_finished_count; /* count of I/O done */ + struct proc *kaio_p; /* process that uses this kaio block */ + TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */ + TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */ + TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */ + TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */ + TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */ + TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ +}; + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ + +static TAILQ_HEAD(,aiothreadlist) aio_activeproc; /* Active daemons */ +static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* Idle daemons */ +static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ +static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ + +static void aio_init_aioinfo(struct proc *p); +static void aio_onceonly(void); +static int aio_free_entry(struct aiocblist *aiocbe); +static void aio_process(struct aiocblist *aiocbe); +static int aio_newproc(void); +static int aio_aqueue(struct thread *td, struct aiocb *job, int type); +static void aio_physwakeup(struct buf *bp); +static void aio_proc_rundown(struct proc *p); +static int aio_fphysio(struct aiocblist *aiocbe); +static int aio_qphysio(struct proc *p, struct aiocblist *iocb); +static void aio_daemon(void *uproc); +static void aio_swake_cb(struct socket *, struct sockbuf *); +static int aio_unload(void); +static void process_signal(void *aioj); +static int filt_aioattach(struct knote *kn); +static void filt_aiodetach(struct knote *kn); +static int filt_aio(struct knote *kn, long hint); + +/* + * Zones 
for: + * kaio Per process async io info + * aiop async io thread data + * aiocb async io jobs + * aiol list io job pointer - internal to aio_suspend XXX + * aiolio list io jobs + */ +static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; + +/* kqueue filters for aio */ +static struct filterops aio_filtops = + { 0, filt_aioattach, filt_aiodetach, filt_aio }; + +/* + * Main operations function for use as a kernel module. + */ +static int +aio_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + aio_onceonly(); + break; + case MOD_UNLOAD: + error = aio_unload(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EINVAL; + break; + } + return (error); +} + +static moduledata_t aio_mod = { + "aio", + &aio_modload, + NULL +}; + +SYSCALL_MODULE_HELPER(aio_return); +SYSCALL_MODULE_HELPER(aio_suspend); +SYSCALL_MODULE_HELPER(aio_cancel); +SYSCALL_MODULE_HELPER(aio_error); +SYSCALL_MODULE_HELPER(aio_read); +SYSCALL_MODULE_HELPER(aio_write); +SYSCALL_MODULE_HELPER(aio_waitcomplete); +SYSCALL_MODULE_HELPER(lio_listio); + +DECLARE_MODULE(aio, aio_mod, + SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(aio, 1); + +/* + * Startup initialization + */ +static void +aio_onceonly(void) +{ + + /* XXX: should probably just use so->callback */ + aio_swake = &aio_swake_cb; + at_exit(aio_proc_rundown); + at_exec(aio_proc_rundown); + kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); + TAILQ_INIT(&aio_freeproc); + TAILQ_INIT(&aio_activeproc); + TAILQ_INIT(&aio_jobs); + TAILQ_INIT(&aio_bufjobs); + kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiod_timeout = AIOD_TIMEOUT_DEFAULT; + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; +} + +/* + * Callback for unload of AIO when used as a module. + */ +static int +aio_unload(void) +{ + + /* + * XXX: no unloads by default, it's too dangerous. + * perhaps we could do it if locked out callers and then + * did an aio_proc_rundown() on each process. + */ + if (!unloadable) + return (EOPNOTSUPP); + + aio_swake = NULL; + rm_at_exit(aio_proc_rundown); + rm_at_exec(aio_proc_rundown); + kqueue_del_filteropts(EVFILT_AIO); + return (0); +} + +/* + * Init the per-process aioinfo structure. The aioinfo limits are set + * per-process for user limit (resource) management. 
+ */ +static void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + if (p->p_aioinfo == NULL) { + ki = uma_zalloc(kaio_zone, M_WAITOK); + p->p_aioinfo = ki; + ki->kaio_flags = 0; + ki->kaio_maxactive_count = max_aio_per_proc; + ki->kaio_active_count = 0; + ki->kaio_qallowed_count = max_aio_queue_per_proc; + ki->kaio_queue_count = 0; + ki->kaio_ballowed_count = max_buf_aio; + ki->kaio_buffer_count = 0; + ki->kaio_buffer_finished_count = 0; + ki->kaio_p = p; + TAILQ_INIT(&ki->kaio_jobdone); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_bufdone); + TAILQ_INIT(&ki->kaio_bufqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + TAILQ_INIT(&ki->kaio_sockqueue); + } + + while (num_aio_procs < target_aio_procs) + aio_newproc(); +} + +/* + * Free a job entry. Wait for completion if it is currently active, but don't + * delay forever. If we delay, we return a flag that says that we have to + * restart the queue scan. + */ +static int +aio_free_entry(struct aiocblist *aiocbe) +{ + struct kaioinfo *ki; + struct aio_liojob *lj; + struct proc *p; + int error; + int s; + + if (aiocbe->jobstate == JOBST_NULL) + panic("aio_free_entry: freeing already free job"); + + p = aiocbe->userproc; + ki = p->p_aioinfo; + lj = aiocbe->lio; + if (ki == NULL) + panic("aio_free_entry: missing p->p_aioinfo"); + + while (aiocbe->jobstate == JOBST_JOBRUNNING) { + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) + return 0; + aiocbe->jobflags |= AIOCBLIST_RUNDOWN; + tsleep(aiocbe, PRIBIO, "jobwai", 0); + } + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + + if (aiocbe->bp == NULL) { + if (ki->kaio_queue_count <= 0) + panic("aio_free_entry: process queue size <= 0"); + if (num_queue_count <= 0) + panic("aio_free_entry: system wide queue size <= 0"); + + if (lj) { + lj->lioj_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_queue_finished_count--; + } + ki->kaio_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_queue_finished_count--; + num_queue_count--; + } else { + if (lj) { + lj->lioj_buffer_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_buffer_finished_count--; + } + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_buffer_finished_count--; + ki->kaio_buffer_count--; + num_buf_aio--; + } + + /* aiocbe is going away, we need to destroy any knotes */ + /* XXXKSE Note the thread here is used to eventually find the + * owning process again, but it is also used to do a fo_close + * and that requires the thread. (but does it require the + * OWNING thread? (or maybe the running thread?) + * There is a semantic problem here... 
+ */ + knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */ + + if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) + && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + + if (aiocbe->jobstate == JOBST_JOBQBUF) { + if ((error = aio_fphysio(aiocbe)) != 0) + return error; + if (aiocbe->jobstate != JOBST_JOBBFINISHED) + panic("aio_free_entry: invalid physio finish-up state"); + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) { + s = splnet(); + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); + splx(s); + } else if (aiocbe->jobstate == JOBST_JOBFINISHED) + TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); + else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + if (aiocbe->bp) { + vunmapbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); + aiocbe->bp = NULL; + } + } + if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + uma_zfree(aiolio_zone, lj); + } + aiocbe->jobstate = JOBST_NULL; + untimeout(process_signal, aiocbe, aiocbe->timeouthandle); + fdrop(aiocbe->fd_file, curthread); + uma_zfree(aiocb_zone, aiocbe); + return 0; +} + +/* + * Rundown the jobs for a given process. + */ +static void +aio_proc_rundown(struct proc *p) +{ + int s; + struct kaioinfo *ki; + struct aio_liojob *lj, *ljn; + struct aiocblist *aiocbe, *aiocbn; + struct file *fp; + struct socket *so; + + ki = p->p_aioinfo; + if (ki == NULL) + return; + + ki->kaio_flags |= LIOJ_SIGNAL_POSTED; + while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > + ki->kaio_buffer_finished_count)) { + ki->kaio_flags |= KAIO_RUNDOWN; + if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) + break; + } + + /* + * Move any aio ops that are waiting on socket I/O to the normal job + * queues so they are cleaned up with any others. + */ + s = splnet(); + for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = + aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + fp = aiocbe->fd_file; + if (fp != NULL) { + so = (struct socket *)fp->f_data; + TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); + if (TAILQ_EMPTY(&so->so_aiojobq)) { + so->so_snd.sb_flags &= ~SB_AIO; + so->so_rcv.sb_flags &= ~SB_AIO; + } + } + TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); + TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); + TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); + } + splx(s); + +restart1: + for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart1; + } + +restart2: + for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = + aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart2; + } + +/* + * Note the use of lots of splbio here, trying to avoid splbio for long chains + * of I/O. Probably unnecessary. 
+ */ +restart3: + s = splbio(); + while (TAILQ_FIRST(&ki->kaio_bufqueue)) { + ki->kaio_flags |= KAIO_WAKEUP; + tsleep(p, PRIBIO, "aioprn", 0); + splx(s); + goto restart3; + } + splx(s); + +restart4: + s = splbio(); + for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) { + splx(s); + goto restart4; + } + } + splx(s); + + /* + * If we've slept, jobs might have moved from one queue to another. + * Retry rundown if we didn't manage to empty the queues. + */ + if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL || + TAILQ_FIRST(&ki->kaio_jobqueue) != NULL || + TAILQ_FIRST(&ki->kaio_bufqueue) != NULL || + TAILQ_FIRST(&ki->kaio_bufdone) != NULL) + goto restart1; + + for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { + ljn = TAILQ_NEXT(lj, lioj_list); + if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == + 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + uma_zfree(aiolio_zone, lj); + } else { +#ifdef DIAGNOSTIC + printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " + "QF:%d\n", lj->lioj_buffer_count, + lj->lioj_buffer_finished_count, + lj->lioj_queue_count, + lj->lioj_queue_finished_count); +#endif + } + } + + uma_zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon). + */ +static struct aiocblist * +aio_selectjob(struct aiothreadlist *aiop) +{ + int s; + struct aiocblist *aiocbe; + struct kaioinfo *ki; + struct proc *userp; + + s = splnet(); + for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = + TAILQ_NEXT(aiocbe, list)) { + userp = aiocbe->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < ki->kaio_maxactive_count) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + splx(s); + return aiocbe; + } + } + splx(s); + + return NULL; +} + +/* + * The AIO processing activity. This is the code that does the I/O request for + * the non-physio version of the operations. The normal vn operations are used, + * and this code should work in all instances for every type of file, including + * pipes, sockets, fifos, and regular files. + */ +static void +aio_process(struct aiocblist *aiocbe) +{ + struct thread *td; + struct proc *mycp; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + int cnt; + int error; + int oublock_st, oublock_end; + int inblock_st, inblock_end; + + td = curthread; + mycp = td->td_proc; + cb = &aiocbe->uaiocb; + fp = aiocbe->fd_file; + + aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + + inblock_st = mycp->p_stats->p_ru.ru_inblock; + oublock_st = mycp->p_stats->p_ru.ru_oublock; + /* + * _aio_aqueue() acquires a reference to the file that is + * released in aio_free_entry(). 
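+ *
+ * Editor's note (not part of the original commit): FOF_OFFSET below asks
+ * fo_read()/fo_write() to use auio.uio_offset (the caller-supplied
+ * aio_offset) rather than the descriptor's shared file offset, so several
+ * outstanding AIO requests on one descriptor do not advance each other's
+ * position.  That is the editor's reading of the fo_*() contract, offered
+ * only as orientation.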
+ */ + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } else { + auio.uio_rw = UIO_WRITE; + error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } + inblock_end = mycp->p_stats->p_ru.ru_inblock; + oublock_end = mycp->p_stats->p_ru.ru_oublock; + + aiocbe->inputcharge = inblock_end - inblock_st; + aiocbe->outputcharge = oublock_end - oublock_st; + + if ((error) && (auio.uio_resid != cnt)) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { + PROC_LOCK(aiocbe->userproc); + psignal(aiocbe->userproc, SIGPIPE); + PROC_UNLOCK(aiocbe->userproc); + } + } + + cnt -= auio.uio_resid; + cb->_aiocb_private.error = error; + cb->_aiocb_private.status = cnt; +} + +/* + * The AIO daemon, most of the actual work is done in aio_process, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *uproc) +{ + int s; + struct aio_liojob *lj; + struct aiocb *cb; + struct aiocblist *aiocbe; + struct aiothreadlist *aiop; + struct kaioinfo *ki; + struct proc *curcp, *mycp, *userp; + struct vmspace *myvm, *tmpvm; + struct thread *td = curthread; + struct pgrp *newpgrp; + struct session *newsess; + + mtx_lock(&Giant); + /* + * Local copies of curproc (cp) and vmspace (myvm) + */ + mycp = td->td_proc; + myvm = mycp->p_vmspace; + + if (mycp->p_textvp) { + vrele(mycp->p_textvp); + mycp->p_textvp = NULL; + } + + /* + * Allocate and ready the aio control info. There is one aiop structure + * per daemon. + */ + aiop = uma_zalloc(aiop_zone, M_WAITOK); + aiop->aiothread = td; + aiop->aiothreadflags |= AIOP_FREE; + + s = splnet(); + + /* + * Place thread (lightweight process) onto the AIO free thread list. + */ + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + + splx(s); + + /* + * Get rid of our current filedescriptors. AIOD's don't need any + * filedescriptors, except as temporarily inherited from the client. + */ + fdfree(td); + mycp->p_fd = NULL; + + mtx_unlock(&Giant); + /* The daemon resides in its own pgrp. */ + MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, + M_WAITOK | M_ZERO); + MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, + M_WAITOK | M_ZERO); + + sx_xlock(&proctree_lock); + enterpgrp(mycp, mycp->p_pid, newpgrp, newsess); + sx_xunlock(&proctree_lock); + mtx_lock(&Giant); + + /* Mark special process type. */ + mycp->p_flag |= P_SYSTEM; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * and creating too many daemons.) + */ + wakeup(mycp); + + for (;;) { + /* + * curcp is the current daemon process context. + * userp is the current user process context. + */ + curcp = mycp; + + /* + * Take daemon off of free queue + */ + if (aiop->aiothreadflags & AIOP_FREE) { + s = splnet(); + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + splx(s); + } + aiop->aiothreadflags &= ~AIOP_SCHED; + + /* + * Check for jobs. + */ + while ((aiocbe = aio_selectjob(aiop)) != NULL) { + cb = &aiocbe->uaiocb; + userp = aiocbe->userproc; + + aiocbe->jobstate = JOBST_JOBRUNNING; + + /* + * Connect to process address space for user program. + */ + if (userp != curcp) { + /* + * Save the current address space that we are + * connected to. 
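+ *
+ * Editor's note (not part of the original commit): the code below swaps
+ * address spaces in four steps: remember the vmspace currently being
+ * borrowed (tmpvm), point p_vmspace at the client's vmspace and take a
+ * reference on it, pmap_activate() to load the new mappings, and finally
+ * drop the reference on the old vmspace unless it was the daemon's own
+ * (myvm).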
+ */ + tmpvm = mycp->p_vmspace; + + /* + * Point to the new user address space, and + * refer to it. + */ + mycp->p_vmspace = userp->p_vmspace; + mycp->p_vmspace->vm_refcnt++; + + /* Activate the new mapping. */ + pmap_activate(FIRST_THREAD_IN_PROC(mycp)); + + /* + * If the old address space wasn't the daemons + * own address space, then we need to remove the + * daemon's reference from the other process + * that it was acting on behalf of. + */ + if (tmpvm != myvm) { + vmspace_free(tmpvm); + } + curcp = userp; + } + + ki = userp->p_aioinfo; + lj = aiocbe->lio; + + /* Account for currently active jobs. */ + ki->kaio_active_count++; + + /* Do the I/O function. */ + aio_process(aiocbe); + + /* Decrement the active job count. */ + ki->kaio_active_count--; + + /* + * Increment the completion count for wakeup/signal + * comparisons. + */ + aiocbe->jobflags |= AIOCBLIST_DONE; + ki->kaio_queue_finished_count++; + if (lj) + lj->lioj_queue_finished_count++; + if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags + & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(userp); + } + + s = splbio(); + if (lj && (lj->lioj_flags & + (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { + if ((lj->lioj_queue_finished_count == + lj->lioj_queue_count) && + (lj->lioj_buffer_finished_count == + lj->lioj_buffer_count)) { + PROC_LOCK(userp); + psignal(userp, + lj->lioj_signal.sigev_signo); + PROC_UNLOCK(userp); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + splx(s); + + aiocbe->jobstate = JOBST_JOBFINISHED; + + /* + * If the I/O request should be automatically rundown, + * do the needed cleanup. Otherwise, place the queue + * entry for the just finished I/O request into the done + * queue for the associated client. + */ + s = splnet(); + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + uma_zfree(aiocb_zone, aiocbe); + } else { + TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, + plist); + } + splx(s); + KNOTE(&aiocbe->klist, 0); + + if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { + wakeup(aiocbe); + aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + PROC_LOCK(userp); + psignal(userp, cb->aio_sigevent.sigev_signo); + PROC_UNLOCK(userp); + } + } + + /* + * Disconnect from user address space. + */ + if (curcp != mycp) { + /* Get the user address space to disconnect from. */ + tmpvm = mycp->p_vmspace; + + /* Get original address space for daemon. */ + mycp->p_vmspace = myvm; + + /* Activate the daemon's address space. */ + pmap_activate(FIRST_THREAD_IN_PROC(mycp)); +#ifdef DIAGNOSTIC + if (tmpvm == myvm) { + printf("AIOD: vmspace problem -- %d\n", + mycp->p_pid); + } +#endif + /* Remove our vmspace reference. */ + vmspace_free(tmpvm); + + curcp = mycp; + } + + /* + * If we are the first to be put onto the free queue, wakeup + * anyone waiting for a daemon. + */ + s = splnet(); + TAILQ_REMOVE(&aio_activeproc, aiop, list); + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aiothreadflags |= AIOP_FREE; + splx(s); + + /* + * If daemon is inactive for a long time, allow it to exit, + * thereby freeing resources. 
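+ *
+ * Editor's note (not part of the original commit): "a long time" here is
+ * the aiod_lifetime tsleep() timeout below; an idle daemon only exits if
+ * the global job queue is empty and we are still above target_aio_procs,
+ * so a small pool of daemons stays resident for the next burst of requests.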
+ */ + if ((aiop->aiothreadflags & AIOP_SCHED) == 0 && + tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) { + s = splnet(); + if (TAILQ_EMPTY(&aio_jobs)) { + if ((aiop->aiothreadflags & AIOP_FREE) && + (num_aio_procs > target_aio_procs)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + splx(s); + uma_zfree(aiop_zone, aiop); + num_aio_procs--; +#ifdef DIAGNOSTIC + if (mycp->p_vmspace->vm_refcnt <= 1) { + printf("AIOD: bad vm refcnt for" + " exiting daemon: %d\n", + mycp->p_vmspace->vm_refcnt); + } +#endif + kthread_exit(0); + } + } + splx(s); + } + } +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The + * AIO daemon modifies its environment itself. + */ +static int +aio_newproc() +{ + int error; + struct proc *p; + + error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d", + num_aio_procs); + if (error) + return error; + + /* + * Wait until daemon is started, but continue on just in case to + * handle error conditions. + */ + error = tsleep(p, PZERO, "aiosta", aiod_timeout); + + num_aio_procs++; + + return error; +} + +/* + * Try the high-performance, low-overhead physio method for eligible + * VCHR devices. This method doesn't use an aio helper thread, and + * thus has very low overhead. + * + * Assumes that the caller, _aio_aqueue(), has incremented the file + * structure's reference count, preventing its deallocation for the + * duration of this call. + */ +static int +aio_qphysio(struct proc *p, struct aiocblist *aiocbe) +{ + int error; + struct aiocb *cb; + struct file *fp; + struct buf *bp; + struct vnode *vp; + struct kaioinfo *ki; + struct aio_liojob *lj; + int s; + int notify; + + cb = &aiocbe->uaiocb; + fp = aiocbe->fd_file; + + if (fp->f_type != DTYPE_VNODE) + return (-1); + + vp = (struct vnode *)fp->f_data; + + /* + * If its not a disk, we don't want to return a positive error. + * It causes the aio code to not fall through to try the thread + * way when you're talking to a regular file. + */ + if (!vn_isdisk(vp, &error)) { + if (error == ENOTBLK) + return (-1); + else + return (error); + } + + if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys) + return (-1); + + if (cb->aio_nbytes > + MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) + return (-1); + + ki = p->p_aioinfo; + if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) + return (-1); + + ki->kaio_buffer_count++; + + lj = aiocbe->lio; + if (lj) + lj->lioj_buffer_count++; + + /* Create and build a buffer header for a transfer. */ + bp = (struct buf *)getpbuf(NULL); + BUF_KERNPROC(bp); + + /* + * Get a copy of the kva from the physical buffer. + */ + bp->b_caller1 = p; + bp->b_dev = vp->v_rdev; + error = bp->b_error = 0; + + bp->b_bcount = cb->aio_nbytes; + bp->b_bufsize = cb->aio_nbytes; + bp->b_flags = B_PHYS; + bp->b_iodone = aio_physwakeup; + bp->b_saveaddr = bp->b_data; + bp->b_data = (void *)(uintptr_t)cb->aio_buf; + bp->b_blkno = btodb(cb->aio_offset); + + if (cb->aio_lio_opcode == LIO_WRITE) { + bp->b_iocmd = BIO_WRITE; + if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) { + error = EFAULT; + goto doerror; + } + } else { + bp->b_iocmd = BIO_READ; + if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) { + error = EFAULT; + goto doerror; + } + } + + /* Bring buffer into kernel space. 
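+ * Editor's note (not part of the original commit): vmapbuf() wires the
+ * user pages backing cb->aio_buf (already probed with useracc() above) and
+ * maps them into the pbuf's kernel window so the driver can perform the
+ * transfer directly; vunmapbuf() in aio_fphysio()/aio_free_entry() undoes
+ * the mapping when the request is reaped.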
*/ + vmapbuf(bp); + + s = splbio(); + aiocbe->bp = bp; + bp->b_spc = (void *)aiocbe; + TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); + aiocbe->jobstate = JOBST_JOBQBUF; + cb->_aiocb_private.status = cb->aio_nbytes; + num_buf_aio++; + bp->b_error = 0; + + splx(s); + + /* Perform transfer. */ + DEV_STRATEGY(bp, 0); + + notify = 0; + s = splbio(); + + /* + * If we had an error invoking the request, or an error in processing + * the request before we have returned, we process it as an error in + * transfer. Note that such an I/O error is not indicated immediately, + * but is returned using the aio_error mechanism. In this case, + * aio_suspend will return immediately. + */ + if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { + struct aiocb *job = aiocbe->uuaiocb; + + aiocbe->uaiocb._aiocb_private.status = 0; + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + suword(&job->_aiocb_private.error, bp->b_error); + + ki->kaio_buffer_finished_count++; + + if (aiocbe->jobstate != JOBST_JOBBFINISHED) { + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->jobflags |= AIOCBLIST_DONE; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + notify = 1; + } + } + splx(s); + if (notify) + KNOTE(&aiocbe->klist, 0); + return 0; + +doerror: + ki->kaio_buffer_count--; + if (lj) + lj->lioj_buffer_count--; + aiocbe->bp = NULL; + relpbuf(bp, NULL); + return error; +} + +/* + * This waits/tests physio completion. + */ +static int +aio_fphysio(struct aiocblist *iocb) +{ + int s; + struct buf *bp; + int error; + + bp = iocb->bp; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } else + break; + } + } + splx(s); + + /* Release mapping into kernel space. */ + vunmapbuf(bp); + iocb->bp = 0; + + error = 0; + + /* Check for an error. */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + relpbuf(bp, NULL); + return (error); +} + +/* + * Wake up aio requests that may be serviceable now. + */ +static void +aio_swake_cb(struct socket *so, struct sockbuf *sb) +{ + struct aiocblist *cb,*cbn; + struct proc *p; + struct kaioinfo *ki = NULL; + int opcode, wakecount = 0; + struct aiothreadlist *aiop; + + if (sb == &so->so_snd) { + opcode = LIO_WRITE; + so->so_snd.sb_flags &= ~SB_AIO; + } else { + opcode = LIO_READ; + so->so_rcv.sb_flags &= ~SB_AIO; + } + + for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { + cbn = TAILQ_NEXT(cb, list); + if (opcode == cb->uaiocb.aio_lio_opcode) { + p = cb->userproc; + ki = p->p_aioinfo; + TAILQ_REMOVE(&so->so_aiojobq, cb, list); + TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); + TAILQ_INSERT_TAIL(&aio_jobs, cb, list); + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); + wakecount++; + if (cb->jobstate != JOBST_JOBQGLOBAL) + panic("invalid queue value"); + } + } + + while (wakecount--) { + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + wakeup(aiop->aiothread); + } + } +} + +/* + * Queue a new AIO request. Choosing either the threaded or direct physio VCHR + * technique is done in this code. 
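+ *
+ * Editor's overview (not part of the original commit): after copying in
+ * and validating the user's aiocb and optionally registering a kqueue
+ * event, the request is routed one of three ways: a socket that is not
+ * yet readable/writable is parked on so_aiojobq and kaio_sockqueue until
+ * aio_swake_cb() moves it back; a request on a raw VCHR disk is attempted
+ * via aio_qphysio(); everything else goes on the per-process and global
+ * job queues to be picked up by an aiod, created with aio_newproc() if
+ * none is free.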
+ */ +static int +_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp; + struct file *fp; + unsigned int fd; + struct socket *so; + int s; + int error; + int opcode; + struct aiocblist *aiocbe; + struct aiothreadlist *aiop; + struct kaioinfo *ki; + struct kevent kev; + struct kqueue *kq; + struct file *kq_fp; + + aiocbe = uma_zalloc(aiocb_zone, M_WAITOK); + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + callout_handle_init(&aiocbe->timeouthandle); + SLIST_INIT(&aiocbe->klist); + + suword(&job->_aiocb_private.status, -1); + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.kernelinfo, -1); + + error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); + if (error) { + suword(&job->_aiocb_private.error, error); + uma_zfree(aiocb_zone, aiocbe); + return error; + } + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && + !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { + uma_zfree(aiocb_zone, aiocbe); + return EINVAL; + } + + /* Save userspace address of the job info. */ + aiocbe->uuaiocb = job; + + /* Get the opcode. */ + if (type != LIO_NOP) + aiocbe->uaiocb.aio_lio_opcode = type; + opcode = aiocbe->uaiocb.aio_lio_opcode; + + /* Get the fd info for process. */ + fdp = p->p_fd; + + /* + * Range check file descriptor. + */ + fd = aiocbe->uaiocb.aio_fildes; + if (fd >= fdp->fd_nfiles) { + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, EBADF); + return EBADF; + } + + fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == + 0))) { + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, EBADF); + return EBADF; + } + fhold(fp); + + if (aiocbe->uaiocb.aio_offset == -1LL) { + error = EINVAL; + goto aqueue_fail; + } + error = suword(&job->_aiocb_private.kernelinfo, jobrefid); + if (error) { + error = EINVAL; + goto aqueue_fail; + } + aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; + if (jobrefid == LONG_MAX) + jobrefid = 1; + else + jobrefid++; + + if (opcode == LIO_NOP) { + fdrop(fp, td); + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) { + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.kernelinfo, 0); + } + return 0; + } + if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { + if (type == 0) + suword(&job->_aiocb_private.status, 0); + error = EINVAL; + goto aqueue_fail; + } + + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { + kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; + kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr; + } + else { + /* + * This method for requesting kevent-based notification won't + * work on the alpha, since we're passing in a pointer + * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- + * based method instead. 
+ */ + struct kevent *kevp; + + kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode; + if (kevp == NULL) + goto no_kqueue; + + error = copyin(kevp, &kev, sizeof(kev)); + if (error) + goto aqueue_fail; + } + if ((u_int)kev.ident >= fdp->fd_nfiles || + (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL || + (kq_fp->f_type != DTYPE_KQUEUE)) { + error = EBADF; + goto aqueue_fail; + } + kq = (struct kqueue *)kq_fp->f_data; + kev.ident = (uintptr_t)aiocbe; + kev.filter = EVFILT_AIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; + error = kqueue_register(kq, &kev, td); +aqueue_fail: + if (error) { + fdrop(fp, td); + uma_zfree(aiocb_zone, aiocbe); + if (type == 0) + suword(&job->_aiocb_private.error, error); + goto done; + } +no_kqueue: + + suword(&job->_aiocb_private.error, EINPROGRESS); + aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; + aiocbe->userproc = p; + aiocbe->jobflags = 0; + aiocbe->lio = lj; + ki = p->p_aioinfo; + + if (fp->f_type == DTYPE_SOCKET) { + /* + * Alternate queueing for socket ops: Reach down into the + * descriptor to get the socket data. Then check to see if the + * socket is ready to be read or written (based on the requested + * operation). + * + * If it is not ready for io, then queue the aiocbe on the + * socket, and set the flags so we get a call when sbnotify() + * happens. + */ + so = (struct socket *)fp->f_data; + s = splnet(); + if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == + LIO_WRITE) && (!sowriteable(so)))) { + TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); + if (opcode == LIO_READ) + so->so_rcv.sb_flags |= SB_AIO; + else + so->so_snd.sb_flags |= SB_AIO; + aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ + ki->kaio_queue_count++; + num_queue_count++; + splx(s); + error = 0; + goto done; + } + splx(s); + } + + if ((error = aio_qphysio(p, aiocbe)) == 0) + goto done; + if (error > 0) { + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = error; + suword(&job->_aiocb_private.error, error); + goto done; + } + + /* No buffer for daemon I/O. */ + aiocbe->bp = NULL; + + ki->kaio_queue_count++; + if (lj) + lj->lioj_queue_count++; + s = splnet(); + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); + splx(s); + aiocbe->jobstate = JOBST_JOBQGLOBAL; + + num_queue_count++; + error = 0; + + /* + * If we don't have a free AIO process, and we are below our quota, then + * start one. Otherwise, depend on the subsequent I/O completions to + * pick-up this job. If we don't sucessfully create the new process + * (thread) due to resource issues, we return an error for now (EAGAIN), + * which is likely not the correct thing to do. + */ + s = splnet(); +retryproc: + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aiothreadflags &= ~AIOP_FREE; + wakeup(aiop->aiothread); + } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && + ((ki->kaio_active_count + num_aio_resv_start) < + ki->kaio_maxactive_count)) { + num_aio_resv_start++; + if ((error = aio_newproc()) == 0) { + num_aio_resv_start--; + goto retryproc; + } + num_aio_resv_start--; + } + splx(s); +done: + return error; +} + +/* + * This routine queues an AIO request, checking for quotas. 
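+ *
+ * Editor's illustration (not part of the original commit): the userland
+ * view of this path, assuming a descriptor fd and a buffer buf, is
+ * roughly
+ *
+ *	struct aiocb cb;
+ *
+ *	bzero(&cb, sizeof(cb));
+ *	cb.aio_fildes = fd;
+ *	cb.aio_buf = buf;
+ *	cb.aio_nbytes = sizeof(buf);
+ *	cb.aio_offset = 0;
+ *	if (aio_read(&cb) == -1)		(enters here with LIO_READ)
+ *		err(1, "aio_read");
+ *	while (aio_error(&cb) == EINPROGRESS)
+ *		usleep(1000);
+ *	n = aio_return(&cb);			(reaps status, frees the job)
+ *
+ * The quota checks below (max_queue_count system-wide and
+ * kaio_qallowed_count per process) are what turn an over-committed
+ * aio_read()/aio_write() into EAGAIN.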
+ */ +static int +aio_aqueue(struct thread *td, struct aiocb *job, int type) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + if (num_queue_count >= max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if (ki->kaio_queue_count >= ki->kaio_qallowed_count) + return EAGAIN; + + return _aio_aqueue(td, job, NULL, type); +} + +/* + * Support the aio_return system call, as a side-effect, kernel resources are + * released. + */ +int +aio_return(struct thread *td, struct aio_return_args *uap) +{ + struct proc *p = td->td_proc; + int s; + long jobref; + struct aiocblist *cb, *ncb; + struct aiocb *ujob; + struct kaioinfo *ki; + + ujob = uap->aiocbp; + jobref = fuword(&ujob->_aiocb_private.kernelinfo); + if (jobref == -1 || jobref == 0) + return EINVAL; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + p->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + goto done; + } + } + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { + ncb = TAILQ_NEXT(cb, plist); + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + break; + } + } + splx(s); + done: + if (cb != NULL) { + if (ujob == cb->uuaiocb) { + td->td_retval[0] = + cb->uaiocb._aiocb_private.status; + } else + td->td_retval[0] = EFAULT; + aio_free_entry(cb); + return (0); + } + return (EINVAL); +} + +/* + * Allow a process to wakeup when any of the I/O requests are completed. + */ +int +aio_suspend(struct thread *td, struct aio_suspend_args *uap) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct timespec ts; + struct aiocb *const *cbptr, *cbp; + struct kaioinfo *ki; + struct aiocblist *cb; + int i; + int njoblist; + int error, s, timo; + long *ijoblist; + struct aiocb **ujoblist; + + if (uap->nent > AIO_LISTIO_MAX) + return EINVAL; + + timo = 0; + if (uap->timeout) { + /* Get timespec struct. 
*/ + if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) + return error; + + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + njoblist = 0; + ijoblist = uma_zalloc(aiol_zone, M_WAITOK); + ujoblist = uma_zalloc(aiol_zone, M_WAITOK); + cbptr = uap->aiocbp; + + for (i = 0; i < uap->nent; i++) { + cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); + if (cbp == 0) + continue; + ujoblist[njoblist] = cbp; + ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); + njoblist++; + } + + if (njoblist == 0) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return 0; + } + + error = 0; + for (;;) { + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + for (i = 0; i < njoblist; i++) { + if (((intptr_t) + cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return error; + } + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = + TAILQ_NEXT(cb, plist)) { + for (i = 0; i < njoblist; i++) { + if (((intptr_t) + cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + splx(s); + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return error; + } + } + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); + splx(s); + + if (error == ERESTART || error == EINTR) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return EINTR; + } else if (error == EWOULDBLOCK) { + uma_zfree(aiol_zone, ijoblist); + uma_zfree(aiol_zone, ujoblist); + return EAGAIN; + } + } + +/* NOTREACHED */ + return EINVAL; +} + +/* + * aio_cancel cancels any non-physio aio operations not currently in + * progress. + */ +int +aio_cancel(struct thread *td, struct aio_cancel_args *uap) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + struct aiocblist *cbe, *cbn; + struct file *fp; + struct filedesc *fdp; + struct socket *so; + struct proc *po; + int s,error; + int cancelled=0; + int notcancelled=0; + struct vnode *vp; + + fdp = p->p_fd; + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + + if (vn_isdisk(vp,&error)) { + td->td_retval[0] = AIO_NOTCANCELED; + return 0; + } + } else if (fp->f_type == DTYPE_SOCKET) { + so = (struct socket *)fp->f_data; + + s = splnet(); + + for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { + cbn = TAILQ_NEXT(cbe, list); + if ((uap->aiocbp == NULL) || + (uap->aiocbp == cbe->uuaiocb) ) { + po = cbe->userproc; + ki = po->p_aioinfo; + TAILQ_REMOVE(&so->so_aiojobq, cbe, list); + TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); + if (ki->kaio_flags & KAIO_WAKEUP) { + wakeup(po); + } + cbe->jobstate = JOBST_JOBFINISHED; + cbe->uaiocb._aiocb_private.status=-1; + cbe->uaiocb._aiocb_private.error=ECANCELED; + cancelled++; +/* XXX cancelled, knote? 
*/ + if (cbe->uaiocb.aio_sigevent.sigev_notify == + SIGEV_SIGNAL) { + PROC_LOCK(cbe->userproc); + psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); + PROC_UNLOCK(cbe->userproc); + } + if (uap->aiocbp) + break; + } + } + splx(s); + + if ((cancelled) && (uap->aiocbp)) { + td->td_retval[0] = AIO_CANCELED; + return 0; + } + } + ki=p->p_aioinfo; + s = splnet(); + + for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { + cbn = TAILQ_NEXT(cbe, plist); + + if ((uap->fd == cbe->uaiocb.aio_fildes) && + ((uap->aiocbp == NULL ) || + (uap->aiocbp == cbe->uuaiocb))) { + + if (cbe->jobstate == JOBST_JOBQGLOBAL) { + TAILQ_REMOVE(&aio_jobs, cbe, list); + TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, + plist); + cancelled++; + ki->kaio_queue_finished_count++; + cbe->jobstate = JOBST_JOBFINISHED; + cbe->uaiocb._aiocb_private.status = -1; + cbe->uaiocb._aiocb_private.error = ECANCELED; +/* XXX cancelled, knote? */ + if (cbe->uaiocb.aio_sigevent.sigev_notify == + SIGEV_SIGNAL) { + PROC_LOCK(cbe->userproc); + psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); + PROC_UNLOCK(cbe->userproc); + } + } else { + notcancelled++; + } + } + } + splx(s); + + if (notcancelled) { + td->td_retval[0] = AIO_NOTCANCELED; + return 0; + } + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return 0; + } + td->td_retval[0] = AIO_ALLDONE; + + return 0; +} + +/* + * aio_error is implemented in the kernel level for compatibility purposes only. + * For a user mode async implementation, it would be best to do it in a userland + * subroutine. + */ +int +aio_error(struct thread *td, struct aio_error_args *uap) +{ + struct proc *p = td->td_proc; + int s; + struct aiocblist *cb; + struct kaioinfo *ki; + long jobref; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + + jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); + if ((jobref == -1) || (jobref == 0)) + return EINVAL; + + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = cb->uaiocb._aiocb_private.error; + return 0; + } + } + + s = splnet(); + + for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = cb->uaiocb._aiocb_private.error; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, + plist)) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + td->td_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + +#if (0) + /* + * Hack for lio. 
+ */ + status = fuword(&uap->aiocbp->_aiocb_private.status); + if (status == -1) + return fuword(&uap->aiocbp->_aiocb_private.error); +#endif + return EINVAL; +} + +/* syscall - asynchronous read from a file (REALTIME) */ +int +aio_read(struct thread *td, struct aio_read_args *uap) +{ + + return aio_aqueue(td, uap->aiocbp, LIO_READ); +} + +/* syscall - asynchronous write to a file (REALTIME) */ +int +aio_write(struct thread *td, struct aio_write_args *uap) +{ + + return aio_aqueue(td, uap->aiocbp, LIO_WRITE); +} + +/* syscall - XXX undocumented */ +int +lio_listio(struct thread *td, struct lio_listio_args *uap) +{ + struct proc *p = td->td_proc; + int nent, nentqueued; + struct aiocb *iocb, * const *cbptr; + struct aiocblist *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + int error, runningcode; + int nerror; + int i; + int s; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return EINVAL; + + nent = uap->nent; + if (nent > AIO_LISTIO_MAX) + return EINVAL; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + if ((nent + num_queue_count) > max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) + return EAGAIN; + + lj = uma_zalloc(aiolio_zone, M_WAITOK); + if (!lj) + return EAGAIN; + + lj->lioj_flags = 0; + lj->lioj_buffer_count = 0; + lj->lioj_buffer_finished_count = 0; + lj->lioj_queue_count = 0; + lj->lioj_queue_finished_count = 0; + lj->lioj_ki = ki; + + /* + * Setup signal. + */ + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &lj->lioj_signal, + sizeof(lj->lioj_signal)); + if (error) { + uma_zfree(aiolio_zone, lj); + return error; + } + if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + lj->lioj_flags |= LIOJ_SIGNAL; + lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; + } else + lj->lioj_flags &= ~LIOJ_SIGNAL; + + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + /* + * Get pointers to the list of I/O requests. + */ + nerror = 0; + nentqueued = 0; + cbptr = uap->acb_list; + for (i = 0; i < uap->nent; i++) { + iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); + if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { + error = _aio_aqueue(td, iocb, lj, 0); + if (error == 0) + nentqueued++; + else + nerror++; + } + } + + /* + * If we haven't queued any, then just return error. + */ + if (nentqueued == 0) + return 0; + + /* + * Calculate the appropriate error return. + */ + runningcode = 0; + if (nerror) + runningcode = EIO; + + if (uap->mode == LIO_WAIT) { + int command, found, jobref; + + for (;;) { + found = 0; + for (i = 0; i < uap->nent; i++) { + /* + * Fetch address of the control buf pointer in + * user space. + */ + iocb = (struct aiocb *) + (intptr_t)fuword(&cbptr[i]); + if (((intptr_t)iocb == -1) || ((intptr_t)iocb + == 0)) + continue; + + /* + * Fetch the associated command from user space. 
+ */ + command = fuword(&iocb->aio_lio_opcode); + if (command == LIO_NOP) { + found++; + continue; + } + + jobref = fuword(&iocb->_aiocb_private.kernelinfo); + + TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + if (cb->uaiocb.aio_lio_opcode + == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock + += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode + == LIO_READ) { + p->p_stats->p_ru.ru_inblock + += cb->inputcharge; + cb->inputcharge = 0; + } + found++; + break; + } + } + + s = splbio(); + TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { + if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) + == jobref) { + found++; + break; + } + } + splx(s); + } + + /* + * If all I/Os have been disposed of, then we can + * return. + */ + if (found == nentqueued) + return runningcode; + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); + + if (error == EINTR) + return EINTR; + else if (error == EWOULDBLOCK) + return EAGAIN; + } + } + + return runningcode; +} + +/* + * This is a weird hack so that we can post a signal. It is safe to do so from + * a timeout routine, but *not* from an interrupt routine. + */ +static void +process_signal(void *aioj) +{ + struct aiocblist *aiocbe = aioj; + struct aio_liojob *lj = aiocbe->lio; + struct aiocb *cb = &aiocbe->uaiocb; + + if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && + (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { + PROC_LOCK(lj->lioj_ki->kaio_p); + psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); + PROC_UNLOCK(lj->lioj_ki->kaio_p); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + PROC_LOCK(aiocbe->userproc); + psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); + PROC_UNLOCK(aiocbe->userproc); + } +} + +/* + * Interrupt handler for physio, performs the necessary process wakeups, and + * signals. + */ +static void +aio_physwakeup(struct buf *bp) +{ + struct aiocblist *aiocbe; + struct proc *p; + struct kaioinfo *ki; + struct aio_liojob *lj; + + wakeup(bp); + + aiocbe = (struct aiocblist *)bp->b_spc; + if (aiocbe) { + p = bp->b_caller1; + + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; + aiocbe->uaiocb._aiocb_private.error = 0; + aiocbe->jobflags |= AIOCBLIST_DONE; + + if (bp->b_ioflags & BIO_ERROR) + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_finished_count++; + + /* + * wakeup/signal if all of the interrupt jobs are done. + */ + if (lj->lioj_buffer_finished_count == + lj->lioj_buffer_count) { + /* + * Post a signal if it is called for. + */ + if ((lj->lioj_flags & + (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + aiocbe->timeouthandle = + timeout(process_signal, + aiocbe, 0); + } + } + } + + ki = p->p_aioinfo; + if (ki) { + ki->kaio_buffer_finished_count++; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + + KNOTE(&aiocbe->klist, 0); + /* Do the wakeup. 
*/ + if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + } + + if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) + aiocbe->timeouthandle = + timeout(process_signal, aiocbe, 0); + } +} + +/* syscall - wait for the next completion of an aio request */ +int +aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct timespec ts; + struct kaioinfo *ki; + struct aiocblist *cb = NULL; + int error, s, timo; + + suword(uap->aiocbp, (int)NULL); + + timo = 0; + if (uap->timeout) { + /* Get timespec struct. */ + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + return error; + + if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + for (;;) { + if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { + suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); + td->td_retval[0] = cb->uaiocb._aiocb_private.status; + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + p->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + p->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + aio_free_entry(cb); + return cb->uaiocb._aiocb_private.error; + } + + s = splbio(); + if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { + splx(s); + suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); + td->td_retval[0] = cb->uaiocb._aiocb_private.status; + aio_free_entry(cb); + return cb->uaiocb._aiocb_private.error; + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); + splx(s); + + if (error == ERESTART) + return EINTR; + else if (error < 0) + return error; + else if (error == EINTR) + return EINTR; + else if (error == EWOULDBLOCK) + return EAGAIN; + } +} + +/* kqueue attach function */ +static int +filt_aioattach(struct knote *kn) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + /* + * The aiocbe pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. + */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_flags &= ~EV_FLAG1; + + SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); + + return (0); +} + +/* kqueue detach function */ +static void +filt_aiodetach(struct knote *kn) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_aio(struct knote *kn, long hint) +{ + struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; + + kn->kn_data = aiocbe->uaiocb._aiocb_private.error; + if (aiocbe->jobstate != JOBST_JOBFINISHED && + aiocbe->jobstate != JOBST_JOBBFINISHED) + return (0); + kn->kn_flags |= EV_EOF; + return (1); +} diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c new file mode 100644 index 0000000..30dc753 --- /dev/null +++ b/sys/kern/vfs_bio.c @@ -0,0 +1,3395 @@ +/* + * Copyright (c) 1994,1997 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * + * $FreeBSD$ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/eventhandler.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/ktr.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> + +static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +struct buf_ops buf_ops_bio = { + "buf_ops_bio", + bwrite +}; + +/* + * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has + * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 
+ */ +struct buf *buf; /* buffer header pool */ +struct mtx buftimelock; /* Interlock on setting prio and timo */ + +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, + int pageno, vm_page_t m); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); +static void vfs_backgroundwritedone(struct buf *bp); +static int flushbufqueues(void); +static void buf_daemon(void); + +int vmiodirenable = TRUE; +SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, + "Use the VM system for directory writes"); +int runningbufspace; +SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, + "Amount of presently outstanding async buffer io"); +static int bufspace; +SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, + "KVA memory used for bufs"); +static int maxbufspace; +SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, + "Maximum allowed value of bufspace (including buf_daemon)"); +static int bufmallocspace; +SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, + "Amount of malloced memory for buffers"); +static int maxbufmallocspace; +SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, + "Maximum amount of malloced memory for buffers"); +static int lobufspace; +SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, + "Minimum amount of buffers we want to have"); +static int hibufspace; +SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, + "Maximum allowed value of bufspace (excluding buf_daemon)"); +static int bufreusecnt; +SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, + "Number of times we have reused a buffer"); +static int buffreekvacnt; +SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, + "Number of times we have freed the KVA space from some buffer"); +static int bufdefragcnt; +SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, + "Number of times we have had to repeat buffer allocation to defragment"); +static int lorunningspace; +SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, + "Minimum preferred space used for in-progress I/O"); +static int hirunningspace; +SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, + "Maximum amount of space to use for in-progress I/O"); +static int numdirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, + "Number of buffers that are dirty (has unwritten changes) at the moment"); +static int lodirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, + "How many buffers we want to have free before bufdaemon can sleep"); +static int hidirtybuffers; +SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, + "When the number of dirty buffers is considered severe"); +static int numfreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, + "Number of free buffers"); +static int lofreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, + "XXX Unused"); +static int hifreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, + "XXX Complicatedly unused"); +static int 
getnewbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, + "Number of calls to getnewbuf"); +static int getnewbufrestarts; +SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, + "Number of times getnewbuf has had to restart a buffer aquisition"); +static int dobkgrdwrite = 1; +SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, + "Do background writes (honoring the BX_BKGRDWRITE flag)?"); + +/* + * Wakeup point for bufdaemon, as well as indicator of whether it is already + * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it + * is idling. + */ +static int bd_request; + +/* + * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. + */ +vm_page_t bogus_page; + +/* + * Offset for bogus_page. + * XXX bogus_offset should be local to bufinit + */ +static vm_offset_t bogus_offset; + +/* + * Synchronization (sleep/wakeup) variable for active buffer space requests. + * Set when wait starts, cleared prior to wakeup(). + * Used in runningbufwakeup() and waitrunningbufspace(). + */ +static int runningbufreq; + +/* + * Synchronization (sleep/wakeup) variable for buffer requests. + * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done + * by and/or. + * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), + * getnewbuf(), and getblk(). + */ +static int needsbuffer; + +/* + * Mask for index into the buffer hash table, which needs to be power of 2 in + * size. Set in kern_vfs_bio_buffer_alloc. + */ +static int bufhashmask; + +/* + * Hash table for all buffers, with a linked list hanging from each table + * entry. Set in kern_vfs_bio_buffer_alloc, initialized in buf_init. + */ +static LIST_HEAD(bufhashhdr, buf) *bufhashtbl; + +/* + * Somewhere to store buffers when they are not in another list, to always + * have them in a list (and thus being able to use the same set of operations + * on them.) + */ +static struct bufhashhdr invalhash; + +/* + * Definitions for the buffer free lists. + */ +#define BUFFER_QUEUES 6 /* number of free buffer queues */ + +#define QUEUE_NONE 0 /* on no queue */ +#define QUEUE_LOCKED 1 /* locked buffers */ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ + +/* Queues for free buffers with various properties */ +static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; +/* + * Single global constant for BUF_WMESG, to avoid getting multiple references. + * buf_wmesg is referred from macros. + */ +const char *buf_wmesg = BUF_WMESG; + +#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ +#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ +#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ +#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ + +/* + * Buffer hash table code. Note that the logical block scans linearly, which + * gives us some L1 cache locality. 
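+ *
+ * Editor's note (not part of the original commit): the index computed in
+ * bufhash() below is (((uintptr_t)vnp >> 7) + bn) & bufhashmask.  Shifting
+ * the vnode pointer right by 7 keeps its low, largely identical alignment
+ * bits from dominating the index, and adding the logical block number
+ * makes consecutive blocks of the same vnode land in consecutive chains,
+ * which is the L1 locality mentioned above.  bufhashmask is a power of
+ * two minus one, so the AND is a cheap modulo.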
+ */ + +static __inline +struct bufhashhdr * +bufhash(struct vnode *vnp, daddr_t bn) +{ + return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); +} + +/* + * numdirtywakeup: + * + * If someone is blocked due to there being too many dirty buffers, + * and numdirtybuffers is now reasonable, wake them up. + */ + +static __inline void +numdirtywakeup(int level) +{ + if (numdirtybuffers <= level) { + if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { + needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; + wakeup(&needsbuffer); + } + } +} + +/* + * bufspacewakeup: + * + * Called when buffer space is potentially available for recovery. + * getnewbuf() will block on this flag when it is unable to free + * sufficient buffer space. Buffer space becomes recoverable when + * bp's get placed back in the queues. + */ + +static __inline void +bufspacewakeup(void) +{ + /* + * If someone is waiting for BUF space, wake them up. Even + * though we haven't freed the kva space yet, the waiting + * process will be able to now. + */ + if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { + needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; + wakeup(&needsbuffer); + } +} + +/* + * runningbufwakeup() - in-progress I/O accounting. + * + */ +static __inline void +runningbufwakeup(struct buf *bp) +{ + if (bp->b_runningbufspace) { + runningbufspace -= bp->b_runningbufspace; + bp->b_runningbufspace = 0; + if (runningbufreq && runningbufspace <= lorunningspace) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + } +} + +/* + * bufcountwakeup: + * + * Called when a buffer has been added to one of the free queues to + * account for the buffer and to wakeup anyone waiting for free buffers. + * This typically occurs when large amounts of metadata are being handled + * by the buffer cache ( else buffer space runs out first, usually ). + */ + +static __inline void +bufcountwakeup(void) +{ + ++numfreebuffers; + if (needsbuffer) { + needsbuffer &= ~VFS_BIO_NEED_ANY; + if (numfreebuffers >= hifreebuffers) + needsbuffer &= ~VFS_BIO_NEED_FREE; + wakeup(&needsbuffer); + } +} + +/* + * waitrunningbufspace() + * + * runningbufspace is a measure of the amount of I/O currently + * running. This routine is used in async-write situations to + * prevent creating huge backups of pending writes to a device. + * Only asynchronous writes are governed by this function. + * + * Reads will adjust runningbufspace, but will not block based on it. + * The read load has a side effect of reducing the allowed write load. + * + * This does NOT turn an async write into a sync write. It waits + * for earlier writes to complete and generally returns before the + * caller's write has reached the device. + */ +static __inline void +waitrunningbufspace(void) +{ + /* + * XXX race against wakeup interrupt, currently + * protected by Giant. FIXME! + */ + while (runningbufspace > hirunningspace) { + ++runningbufreq; + tsleep(&runningbufreq, PVM, "wdrain", 0); + } +} + + +/* + * vfs_buf_test_cache: + * + * Called when a buffer is extended. This function clears the B_CACHE + * bit if the newly extended portion of the buffer does not contain + * valid data. 
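+ *
+ * Editor's example (not part of the original commit): if a 4K buffer that
+ * currently has B_CACHE set is grown to 8K and the page backing the new
+ * second half has no valid bits for that range, clearing B_CACHE here
+ * forces the caller to read or zero-fill the extension instead of
+ * trusting whatever stale data the larger mapping now exposes.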
+ */ +static __inline__ +void +vfs_buf_test_cache(struct buf *bp, + vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, + vm_page_t m) +{ + GIANT_REQUIRED; + + if (bp->b_flags & B_CACHE) { + int base = (foff + off) & PAGE_MASK; + if (vm_page_is_valid(m, base, size) == 0) + bp->b_flags &= ~B_CACHE; + } +} + +/* Wake up the buffer deamon if necessary */ +static __inline__ +void +bd_wakeup(int dirtybuflevel) +{ + if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { + bd_request = 1; + wakeup(&bd_request); + } +} + +/* + * bd_speedup - speedup the buffer cache flushing code + */ + +static __inline__ +void +bd_speedup(void) +{ + bd_wakeup(1); +} + +/* + * Calculating buffer cache scaling values and reserve space for buffer + * headers. This is called during low level kernel initialization and + * may be called more then once. We CANNOT write to the memory area + * being reserved at this time. + */ +caddr_t +kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est) +{ + /* + * physmem_est is in pages. Convert it to kilobytes (assumes + * PAGE_SIZE is >= 1K) + */ + physmem_est = physmem_est * (PAGE_SIZE / 1024); + + /* + * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. + * For the first 64MB of ram nominally allocate sufficient buffers to + * cover 1/4 of our ram. Beyond the first 64MB allocate additional + * buffers to cover 1/20 of our ram over 64MB. When auto-sizing + * the buffer cache we limit the eventual kva reservation to + * maxbcache bytes. + * + * factor represents the 1/4 x ram conversion. + */ + if (nbuf == 0) { + int factor = 4 * BKVASIZE / 1024; + + nbuf = 50; + if (physmem_est > 4096) + nbuf += min((physmem_est - 4096) / factor, + 65536 / factor); + if (physmem_est > 65536) + nbuf += (physmem_est - 65536) * 2 / (factor * 5); + + if (maxbcache && nbuf > maxbcache / BKVASIZE) + nbuf = maxbcache / BKVASIZE; + } + +#if 0 + /* + * Do not allow the buffer_map to be more then 1/2 the size of the + * kernel_map. + */ + if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / + (BKVASIZE * 2)) { + nbuf = (kernel_map->max_offset - kernel_map->min_offset) / + (BKVASIZE * 2); + printf("Warning: nbufs capped at %d\n", nbuf); + } +#endif + + /* + * swbufs are used as temporary holders for I/O, such as paging I/O. + * We have no less then 16 and no more then 256. + */ + nswbuf = max(min(nbuf/4, 256), 16); + + /* + * Reserve space for the buffer cache buffers + */ + swbuf = (void *)v; + v = (caddr_t)(swbuf + nswbuf); + buf = (void *)v; + v = (caddr_t)(buf + nbuf); + + /* + * Calculate the hash table size and reserve space + */ + for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) + ; + bufhashtbl = (void *)v; + v = (caddr_t)(bufhashtbl + bufhashmask); + --bufhashmask; + + return(v); +} + +/* Initialize the buffer subsystem. Called before use of any buffers. 
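+ *
+ * Editor's worked example for kern_vfs_bio_buffer_alloc() above (not part
+ * of the original commit, and assuming BKVASIZE is 16K purely for
+ * illustration, so factor = 64): with 128MB of RAM, physmem_est is
+ * 131072KB, giving nbuf = 50 + min((131072 - 4096) / 64, 65536 / 64)
+ * = 50 + 1024 = 1074, plus (131072 - 65536) * 2 / (64 * 5) = 409 for the
+ * memory above 64MB, i.e. roughly 1483 buffer headers before the
+ * maxbcache clamp is applied.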
*/ +void +bufinit(void) +{ + struct buf *bp; + int i; + + GIANT_REQUIRED; + + LIST_INIT(&invalhash); + mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF); + + for (i = 0; i <= bufhashmask; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; + bp->b_xflags = 0; + LIST_INIT(&bp->b_dep); + BUF_LOCKINIT(bp); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + } + + /* + * maxbufspace is the absolute maximum amount of buffer space we are + * allowed to reserve in KVM and in real terms. The absolute maximum + * is nominally used by buf_daemon. hibufspace is the nominal maximum + * used by most other processes. The differential is required to + * ensure that buf_daemon is able to run when other processes might + * be blocked waiting for buffer space. + * + * maxbufspace is based on BKVASIZE. Allocating buffers larger then + * this may result in KVM fragmentation which is not handled optimally + * by the system. + */ + maxbufspace = nbuf * BKVASIZE; + hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); + lobufspace = hibufspace - MAXBSIZE; + + lorunningspace = 512 * 1024; + hirunningspace = 1024 * 1024; + +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = hibufspace / 20; + +/* + * Reduce the chance of a deadlock occuring by limiting the number + * of delayed-write dirty buffers we allow to stack up. + */ + hidirtybuffers = nbuf / 4 + 20; + numdirtybuffers = 0; +/* + * To support extreme low-memory systems, make sure hidirtybuffers cannot + * eat up all available buffer space. This occurs when our minimum cannot + * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming + * BKVASIZE'd (8K) buffers. + */ + while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { + hidirtybuffers >>= 1; + } + lodirtybuffers = hidirtybuffers / 2; + +/* + * Try to keep the number of free buffers in the specified range, + * and give special processes (e.g. like buf_daemon) access to an + * emergency reserve. + */ + lofreebuffers = nbuf / 18 + 5; + hifreebuffers = 2 * lofreebuffers; + numfreebuffers = nbuf; + +/* + * Maximum number of async ops initiated per buf_daemon loop. This is + * somewhat of a hack at the moment, we really need to limit ourselves + * based on the number of bytes of I/O in-transit that were initiated + * from buf_daemon. + */ + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + cnt.v_wire_count++; +} + +/* + * bfreekva() - free the kva allocation for a buffer. + * + * Must be called at splbio() or higher as this is the only locking for + * buffer_map. + * + * Since this call frees up buffer space, we call bufspacewakeup(). 
+ */ +static void +bfreekva(struct buf * bp) +{ + GIANT_REQUIRED; + + if (bp->b_kvasize) { + ++buffreekvacnt; + bufspace -= bp->b_kvasize; + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize + ); + bp->b_kvasize = 0; + bufspacewakeup(); + } +} + +/* + * bremfree: + * + * Remove the buffer from the appropriate free list. + */ +void +bremfree(struct buf * bp) +{ + int s = splbio(); + int old_qindex = bp->b_qindex; + + GIANT_REQUIRED; + + if (bp->b_qindex != QUEUE_NONE) { + KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { + if (BUF_REFCNT(bp) <= 1) + panic("bremfree: removing a buffer not on a queue"); + } + + /* + * Fixup numfreebuffers count. If the buffer is invalid or not + * delayed-write, and it was on the EMPTY, LRU, or AGE queues, + * the buffer was free and we must decrement numfreebuffers. + */ + if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { + switch(old_qindex) { + case QUEUE_DIRTY: + case QUEUE_CLEAN: + case QUEUE_EMPTY: + case QUEUE_EMPTYKVA: + --numfreebuffers; + break; + default: + break; + } + } + splx(s); +} + + +/* + * Get a buffer with the specified data. Look in the cache first. We + * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE + * is set, the buffer is valid and we do not have to do anything ( see + * getblk() ). This is really just a special case of breadn(). + */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) +{ + + return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior + * to initiating I/O . If B_CACHE is set, the buffer is valid + * and we do not have to do anything. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_inblock++; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_rcred == NOCRED && cred != NOCRED) + bp->b_rcred = crhold(cred); + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + ++readwait; + } + + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + VOP_STRATEGY(vp, rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = bufwait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. 
This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. + */ + +int +bwrite(struct buf * bp) +{ + int oldflags, s; + struct buf *newbp; + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + oldflags = bp->b_flags; + + if (BUF_REFCNT(bp) == 0) + panic("bwrite: buffer is not busy???"); + s = splbio(); + /* + * If a background write is already in progress, delay + * writing this block if it is asynchronous. Otherwise + * wait for the background write to complete. + */ + if (bp->b_xflags & BX_BKGRDINPROG) { + if (bp->b_flags & B_ASYNC) { + splx(s); + bdwrite(bp); + return (0); + } + bp->b_xflags |= BX_BKGRDWAIT; + tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0); + if (bp->b_xflags & BX_BKGRDINPROG) + panic("bwrite: still writing"); + } + + /* Mark the buffer clean */ + bundirty(bp); + + /* + * If this buffer is marked for background writing and we + * do not have to wait for it, make a copy and write the + * copy so as to leave this buffer ready for further use. + * + * This optimization eats a lot of memory. If we have a page + * or buffer shortfall we can't do it. + */ + if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && + (bp->b_flags & B_ASYNC) && + !vm_page_count_severe() && + !buf_dirty_count_severe()) { + if (bp->b_iodone != NULL) { + printf("bp->b_iodone = %p\n", bp->b_iodone); + panic("bwrite: need chained iodone"); + } + + /* get a new block */ + newbp = geteblk(bp->b_bufsize); + + /* set it to be identical to the old block */ + memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); + bgetvp(bp->b_vp, newbp); + newbp->b_lblkno = bp->b_lblkno; + newbp->b_blkno = bp->b_blkno; + newbp->b_offset = bp->b_offset; + newbp->b_iodone = vfs_backgroundwritedone; + newbp->b_flags |= B_ASYNC; + newbp->b_flags &= ~B_INVAL; + + /* move over the dependencies */ + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_movedeps(bp, newbp); + + /* + * Initiate write on the copy, release the original to + * the B_LOCKED queue so that it cannot go away until + * the background write completes. If not locked it could go + * away and then be reconstituted while it was being written. + * If the reconstituted buffer were written, we could end up + * with two background copies being written at the same time. + */ + bp->b_xflags |= BX_BKGRDINPROG; + bp->b_flags |= B_LOCKED; + bqrelse(bp); + bp = newbp; + } + + bp->b_flags &= ~B_DONE; + bp->b_ioflags &= ~BIO_ERROR; + bp->b_flags |= B_WRITEINPROG | B_CACHE; + bp->b_iocmd = BIO_WRITE; + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + + /* + * Normal bwrites pipeline writes + */ + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; + + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_oublock++; + splx(s); + if (oldflags & B_ASYNC) + BUF_KERNPROC(bp); + BUF_STRATEGY(bp); + + if ((oldflags & B_ASYNC) == 0) { + int rtval = bufwait(bp); + brelse(bp); + return (rtval); + } else if ((oldflags & B_NOWDRAIN) == 0) { + /* + * don't allow the async write to saturate the I/O + * system. Deadlocks can occur only if a device strategy + * routine (like in MD) turns around and issues another + * high-level write, in which case B_NOWDRAIN is expected + * to be set. Otherwise we will not deadlock here because + * we are blocking waiting for I/O that is already in-progress + * to complete. + */ + waitrunningbufspace(); + } + + return (0); +} + +/* + * Complete a background write started from bwrite. 
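For context, here is a hedged sketch of how a caller typically drives bread()/bwrite()/bdwrite() as defined above. It assumes kernel context; the helper name, its error handling, and the choice of bwrite() over the BUF_WRITE() wrapper are illustrative, not code from this change.

/*
 * Illustrative fragment (kernel context assumed, hypothetical helper):
 * the usual read-modify-write cycle against the buffer cache.
 */
static int
example_update_block(struct vnode *vp, daddr_t blkno, int size, int sync)
{
        struct buf *bp;
        int error;

        error = bread(vp, blkno, size, NOCRED, &bp);
        if (error) {
                brelse(bp);             /* conventional: release the failed buffer */
                return (error);
        }
        /* ... modify the data through bp->b_data ... */
        if (sync)
                return (bwrite(bp));    /* waits for the I/O and releases bp */
        bdwrite(bp);                    /* marks dirty and releases; written later */
        return (0);
}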
+ */ +static void +vfs_backgroundwritedone(bp) + struct buf *bp; +{ + struct buf *origbp; + + /* + * Find the original buffer that we are writing. + */ + if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL) + panic("backgroundwritedone: lost buffer"); + /* + * Process dependencies then return any unfinished ones. + */ + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_complete(bp); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_movedeps(bp, origbp); + /* + * Clear the BX_BKGRDINPROG flag in the original buffer + * and awaken it if it is waiting for the write to complete. + * If BX_BKGRDINPROG is not set in the original buffer it must + * have been released and re-instantiated - which is not legal. + */ + KASSERT((origbp->b_xflags & BX_BKGRDINPROG), + ("backgroundwritedone: lost buffer2")); + origbp->b_xflags &= ~BX_BKGRDINPROG; + if (origbp->b_xflags & BX_BKGRDWAIT) { + origbp->b_xflags &= ~BX_BKGRDWAIT; + wakeup(&origbp->b_xflags); + } + /* + * Clear the B_LOCKED flag and remove it from the locked + * queue if it currently resides there. + */ + origbp->b_flags &= ~B_LOCKED; + if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { + bremfree(origbp); + bqrelse(origbp); + } + /* + * This buffer is marked B_NOCACHE, so when it is released + * by biodone, it will be tossed. We mark it with BIO_READ + * to avoid biodone doing a second vwakeup. + */ + bp->b_flags |= B_NOCACHE; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~(B_CACHE | B_DONE); + bp->b_iodone = 0; + bufdone(bp); +} + +/* + * Delayed write. (Buffer is marked dirty). Do not bother writing + * anything if the buffer is marked invalid. + * + * Note that since the buffer must be completely valid, we can safely + * set B_CACHE. In fact, we have to set B_CACHE here rather then in + * biodone() in order to prevent getblk from writing the buffer + * out synchronously. + */ +void +bdwrite(struct buf * bp) +{ + GIANT_REQUIRED; + + if (BUF_REFCNT(bp) == 0) + panic("bdwrite: buffer is not busy"); + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + bdirty(bp); + + /* + * Set B_CACHE, indicating that the buffer is fully valid. This is + * true even of NFS now. + */ + bp->b_flags |= B_CACHE; + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); + + /* + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. + */ + vfs_clean_pages(bp); + bqrelse(bp); + + /* + * Wakeup the buffer flushing daemon if we have a lot of dirty + * buffers (midpoint between our recovery point and our stall + * point). + */ + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); + + /* + * note: we cannot initiate I/O from a bdwrite even if we wanted to, + * due to the softdep code. 
+ */ +} + +/* + * bdirty: + * + * Turn buffer into delayed write request. We must clear BIO_READ and + * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to + * itself to properly update it in the dirty/clean lists. We mark it + * B_DONE to ensure that any asynchronization of the buffer properly + * clears B_DONE ( else a panic will occur later ). + * + * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which + * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() + * should only be called if the buffer is known-good. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. + */ +void +bdirty(bp) + struct buf *bp; +{ + KASSERT(bp->b_qindex == QUEUE_NONE, + ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); + bp->b_flags &= ~(B_RELBUF); + bp->b_iocmd = BIO_WRITE; + + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); + } +} + +/* + * bundirty: + * + * Clear B_DELWRI for buffer. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * Must be called at splbio(). + * The buffer must be on QUEUE_NONE. + */ + +void +bundirty(bp) + struct buf *bp; +{ + KASSERT(bp->b_qindex == QUEUE_NONE, + ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); + + if (bp->b_flags & B_DELWRI) { + bp->b_flags &= ~B_DELWRI; + reassignbuf(bp, bp->b_vp); + --numdirtybuffers; + numdirtywakeup(lodirtybuffers); + } + /* + * Since it is now being written, we can clear its deferred write flag. + */ + bp->b_flags &= ~B_DEFERRED; +} + +/* + * bawrite: + * + * Asynchronous write. Start output on a buffer, but do not wait for + * it to complete. The buffer is released when the output completes. + * + * bwrite() ( or the VOP routine anyway ) is responsible for handling + * B_INVAL buffers. Not us. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) BUF_WRITE(bp); +} + +/* + * bwillwrite: + * + * Called prior to the locking of any vnodes when we are expecting to + * write. We do not want to starve the buffer cache with too many + * dirty buffers so we block here. By blocking prior to the locking + * of any vnodes we attempt to avoid the situation where a locked vnode + * prevents the various system daemons from flushing related buffers. + */ + +void +bwillwrite(void) +{ + if (numdirtybuffers >= hidirtybuffers) { + int s; + + mtx_lock(&Giant); + s = splbio(); + while (numdirtybuffers >= hidirtybuffers) { + bd_wakeup(1); + needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; + tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); + } + splx(s); + mtx_unlock(&Giant); + } +} + +/* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + return(numdirtybuffers >= hidirtybuffers); +} + +/* + * brelse: + * + * Release a busy buffer and, if requested, free its resources. The + * buffer will be stashed in the appropriate bufqueue[] allowing it + * to be accessed later as a cache entity or reused for other purposes. 
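A minimal sketch of the calling convention described in the bwillwrite() comment above; the placement before any vnode locks is the point, and the surrounding write path is only hinted at in comments.

/*
 * Illustrative fragment (kernel context assumed): throttle before any
 * vnode locks are taken, as prescribed above.
 */
        bwillwrite();           /* may sleep until the buf daemon catches up */
        /* ... now lock the vnode and bread()/bdwrite() the data blocks ... */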
+ */ +void +brelse(struct buf * bp) +{ + int s; + + GIANT_REQUIRED; + + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + s = splbio(); + + if (bp->b_flags & B_LOCKED) + bp->b_ioflags &= ~BIO_ERROR; + + if (bp->b_iocmd == BIO_WRITE && + (bp->b_ioflags & BIO_ERROR) && + !(bp->b_flags & B_INVAL)) { + /* + * Failed write, redirty. Must clear BIO_ERROR to prevent + * pages from being scrapped. If B_INVAL is set then + * this case is not run and the next case is run to + * destroy the buffer. B_INVAL can occur if the buffer + * is outside the range supported by the underlying device. + */ + bp->b_ioflags &= ~BIO_ERROR; + bdirty(bp); + } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || + (bp->b_ioflags & BIO_ERROR) || + bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) { + /* + * Either a failed I/O or we were asked to free or not + * cache the buffer. + */ + bp->b_flags |= B_INVAL; + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); + if (bp->b_flags & B_DELWRI) { + --numdirtybuffers; + numdirtywakeup(lodirtybuffers); + } + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if ((bp->b_flags & B_VMIO) == 0) { + if (bp->b_bufsize) + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_release(), even + * if B_DELWRI is set. + * + * If B_DELWRI is not set we may have to set B_RELBUF if we are low + * on pages to return pages to the VM page queues. + */ + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG)) + bp->b_flags |= B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, not even NFS buffers now. Two flags effect this. If + * B_INVAL, the struct buf is invalidated but the VM object is kept + * around ( i.e. so it is trivial to reconstitute the buffer later ). + * + * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be + * invalidated. BIO_ERROR cannot be set for a failed write unless the + * buffer is also B_INVAL because it hits the re-dirtying code above. + * + * Normally we can do this whether a buffer is B_DELWRI or not. If + * the buffer is an NFS buffer, it is tracking piecemeal writes or + * the commit state and we cannot afford to lose the buffer. If the + * buffer has a background write in progress, we need to keep it + * around to prevent it from being reconstituted and starting a second + * background write. + */ + if ((bp->b_flags & B_VMIO) + && !(bp->b_vp->v_tag == VT_NFS && + !vn_isdisk(bp->b_vp, NULL) && + (bp->b_flags & B_DELWRI)) + ) { + + int i, j, resid; + vm_page_t m; + off_t foff; + vm_pindex_t poff; + vm_object_t obj; + struct vnode *vp; + + vp = bp->b_vp; + + /* + * Get the base offset and length of the buffer. Note that + * in the VMIO case if the buffer block size is not + * page-aligned then b_data pointer may not be page-aligned. + * But our b_pages[] array *IS* page aligned. + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). 
+ * + * See man buf(9) for more information + */ + resid = bp->b_bufsize; + foff = bp->b_offset; + + for (i = 0; i < bp->b_npages; i++) { + int had_bogus = 0; + + m = bp->b_pages[i]; + vm_page_flag_clear(m, PG_ZERO); + + /* + * If we hit a bogus page, fixup *all* the bogus pages + * now. + */ + if (m == bogus_page) { + VOP_GETVOBJECT(vp, &obj); + poff = OFF_TO_IDX(bp->b_offset); + had_bogus = 1; + + for (j = i; j < bp->b_npages; j++) { + vm_page_t mtmp; + mtmp = bp->b_pages[j]; + if (mtmp == bogus_page) { + mtmp = vm_page_lookup(obj, poff + j); + if (!mtmp) { + panic("brelse: page missing\n"); + } + bp->b_pages[j] = mtmp; + } + } + + if ((bp->b_flags & B_INVAL) == 0) { + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + m = bp->b_pages[i]; + } + if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { + int poffset = foff & PAGE_MASK; + int presid = resid > (PAGE_SIZE - poffset) ? + (PAGE_SIZE - poffset) : resid; + + KASSERT(presid >= 0, ("brelse: extra page")); + vm_page_set_invalid(m, poffset, presid); + if (had_bogus) + printf("avoided corruption bug in bogus_page/brelse code\n"); + } + resid -= PAGE_SIZE - (foff & PAGE_MASK); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } else if (bp->b_flags & B_VMIO) { + + if (bp->b_flags & (B_INVAL | B_RELBUF)) { + vfs_vmio_release(bp); + } + + } + + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); + if (BUF_REFCNT(bp) > 1) { + /* do not release to free list */ + BUF_UNLOCK(bp); + splx(s); + return; + } + + /* enqueue */ + + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_flags |= B_INVAL; + bp->b_xflags &= ~BX_BKGRDWRITE; + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_kvasize) { + bp->b_qindex = QUEUE_EMPTYKVA; + } else { + bp->b_qindex = QUEUE_EMPTY; + } + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + /* buffers with junk contents */ + } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || + (bp->b_ioflags & BIO_ERROR)) { + bp->b_flags |= B_INVAL; + bp->b_xflags &= ~BX_BKGRDWRITE; + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 2"); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + + /* remaining buffers */ + } else { + if (bp->b_flags & B_DELWRI) + bp->b_qindex = QUEUE_DIRTY; + else + bp->b_qindex = QUEUE_CLEAN; + if (bp->b_flags & B_AGE) + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); + else + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + } + + /* + * If B_INVAL, clear B_DELWRI. We've already placed the buffer + * on the correct queue. + */ + if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) + bundirty(bp); + + /* + * Fixup numfreebuffers count. The bp is on an appropriate queue + * unless locked. We then bump numfreebuffers if it is not B_DELWRI. + * We've already handled the B_INVAL case ( B_DELWRI will be clear + * if B_INVAL is set ). 
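To summarize the queue selection brelse() just performed, here is a small standalone model (illustrative names and values, not kernel code) mirroring the decision order: empty buffers first, then invalid or failed buffers, then locked buffers, then dirty versus clean.

/* Illustrative only: a userland model of brelse()'s queue choice above. */
#include <stdio.h>

enum ex_queue { EX_EMPTY, EX_EMPTYKVA, EX_CLEAN, EX_LOCKED, EX_DIRTY };

struct ex_buf {
        int     bufsize, kvasize;
        int     invalid;        /* B_INVAL | B_NOCACHE | B_RELBUF or I/O error */
        int     locked;         /* B_LOCKED */
        int     delwri;         /* B_DELWRI */
};

static enum ex_queue
ex_brelse_queue(const struct ex_buf *bp)
{
        if (bp->bufsize == 0)
                return (bp->kvasize ? EX_EMPTYKVA : EX_EMPTY);
        if (bp->invalid)
                return (EX_CLEAN);      /* junk contents, reusable immediately */
        if (bp->locked)
                return (EX_LOCKED);
        return (bp->delwri ? EX_DIRTY : EX_CLEAN);
}

int
main(void)
{
        struct ex_buf dirty = { 8192, 16384, 0, 0, 1 };
        struct ex_buf bad = { 8192, 16384, 1, 0, 1 };

        printf("delayed write -> %d, failed/invalid -> %d\n",
            ex_brelse_queue(&dirty), ex_brelse_queue(&bad));
        return (0);
}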
+ */ + + if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) + bufcountwakeup(); + + /* + * Something we can maybe free or reuse + */ + if (bp->b_bufsize || bp->b_kvasize) + bufspacewakeup(); + + /* unlock */ + BUF_UNLOCK(bp); + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | + B_DIRECT | B_NOWDRAIN); + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("brelse: not dirty"); + splx(s); +} + +/* + * Release a buffer back to the appropriate queue but do not try to free + * it. The buffer is expected to be used again soon. + * + * bqrelse() is used by bdwrite() to requeue a delayed write, and used by + * biodone() to requeue an async I/O on completion. It is also used when + * known good buffers need to be requeued but we think we may need the data + * again soon. + * + * XXX we should be able to leave the B_RELBUF hint set on completion. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + if (BUF_REFCNT(bp) > 1) { + /* do not release to free list */ + BUF_UNLOCK(bp); + splx(s); + return; + } + if (bp->b_flags & B_LOCKED) { + bp->b_ioflags &= ~BIO_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_DELWRI) { + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + } else if (vm_page_count_severe()) { + /* + * We are too low on memory, we have to try to free the + * buffer (most importantly: the wired pages making up its + * backing store) *now*. + */ + splx(s); + brelse(bp); + return; + } else { + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + } + + if ((bp->b_flags & B_LOCKED) == 0 && + ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { + bufcountwakeup(); + } + + /* + * Something we can maybe free or reuse. + */ + if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) + bufspacewakeup(); + + /* unlock */ + BUF_UNLOCK(bp); + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); + splx(s); +} + +/* Give pages used by the bp back to the VM system (where possible) */ +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + GIANT_REQUIRED; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + /* + * In order to keep page LRU ordering consistent, put + * everything on the inactive queue. + */ + vm_page_unwire(m, 0); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + vm_page_flag_clear(m, PG_ZERO); + /* + * Might as well free the page if we can and it has + * no valid data. 
We also free the page if the + * buffer was used for direct I/O + */ + if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && + m->hold_count == 0) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } else if (bp->b_flags & B_DIRECT) { + vm_page_try_to_free(m); + } else if (vm_page_count_severe()) { + vm_page_try_to_cache(m); + } + } + } + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + + if (bp->b_bufsize) { + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_npages = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +gbincore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + struct bufhashhdr *bh; + + bh = bufhash(vp, blkno); + + /* Search hash chain */ + LIST_FOREACH(bp, bh, b_hash) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + } + return (bp); +} + +/* + * vfs_bio_awrite: + * + * Implement clustered async writes for clearing out B_DELWRI buffers. + * This is much better then the old way of writing only one buffer at + * a time. Note that we may not be presented with the buffers in the + * correct order, so we search for the cluster in both directions. + */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + int j; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + int size; + int maxcl; + + s = splbio(); + /* + * right now we support clustered writing only to regular files. If + * we find a clusterable block we could be in the middle of a cluster + * rather then at the beginning. + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + BUF_REFCNT(bpa) == 0 && + ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != + bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + for (j = 1; i + j <= maxcl && j <= lblkno; j++) { + if ((bpa = gbincore(vp, lblkno - j)) && + BUF_REFCNT(bpa) == 0 && + ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != + bp->b_blkno - ((j * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + --j; + ncl = i + j; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); + splx(s); + return nwritten; + } + } + + BUF_LOCK(bp, LK_EXCLUSIVE); + bremfree(bp); + bp->b_flags |= B_ASYNC; + + splx(s); + /* + * default (old) behavior, writing out only one block + * + * XXX returns b_bufsize instead of b_bcount for nwritten? + */ + nwritten = bp->b_bufsize; + (void) BUF_WRITE(bp); + + return nwritten; +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * Important: B_INVAL is not set. If the caller wishes to throw the + * buffer away, the caller must set B_INVAL prior to calling brelse(). 
+ * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the buf daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. + */ + +static struct buf * +getnewbuf(int slpflag, int slptimeo, int size, int maxsize) +{ + struct buf *bp; + struct buf *nbp; + int defrag = 0; + int nqindex; + static int flushingbufs; + + GIANT_REQUIRED; + + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + + ++getnewbufcalls; + --getnewbufrestarts; +restart: + ++getnewbufrestarts; + + /* + * Setup for scan. If we do not have enough free buffers, + * we setup a degenerate case that immediately fails. Note + * that if we are specially marked process, we are allowed to + * dip into our reserves. + * + * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * + * We start with EMPTYKVA. If the list is empty we backup to EMPTY. + * However, there are a number of cases (defragging, reusing, ...) + * where we cannot backup. + */ + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + + if (nbp == NULL) { + /* + * If no EMPTYKVA buffers and we are either + * defragging or reusing, locate a CLEAN buffer + * to free or reuse. If bufspace useage is low + * skip this step so we can allocate a new buffer. + */ + if (defrag || bufspace >= lobufspace) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + + /* + * If we could not find or were not allowed to reuse a + * CLEAN buffer, check to see if it is ok to use an EMPTY + * buffer. We can only use an EMPTY buffer if allocating + * its KVA would not otherwise run us out of buffer space. + */ + if (nbp == NULL && defrag == 0 && + bufspace + maxsize < hibufspace) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } + } + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + + while ((bp = nbp) != NULL) { + int qindex = nqindex; + + /* + * Calculate next bp ( we can only use it if we do not block + * or do other fancy things ). + */ + if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { + switch(qindex) { + case QUEUE_EMPTY: + nqindex = QUEUE_EMPTYKVA; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) + break; + /* fall through */ + case QUEUE_EMPTYKVA: + nqindex = QUEUE_CLEAN; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) + break; + /* fall through */ + case QUEUE_CLEAN: + /* + * nbp is NULL. + */ + break; + } + } + + /* + * Sanity Checks + */ + KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); + + /* + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. + */ + + KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); + + /* + * If we are defragging then we need a buffer with + * b_kvasize != 0. XXX this situation should no longer + * occur, if defrag is non-zero the buffer's b_kvasize + * should also be non-zero at this point. XXX + */ + if (defrag && bp->b_kvasize == 0) { + printf("Warning: defrag empty buffer %p\n", bp); + continue; + } + + /* + * Start freeing the bp. 
This is somewhat involved. nbp + * remains valid only for QUEUE_EMPTY[KVA] bp's. + */ + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) + panic("getnewbuf: locked buf"); + bremfree(bp); + + if (qindex == QUEUE_CLEAN) { + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + if (bp->b_vp) + brelvp(bp); + } + + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + * + * Get the rest of the buffer freed up. b_kva* is still + * valid after this operation. + */ + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); + if (bp->b_xflags & BX_BKGRDINPROG) + panic("losing buffer 3"); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + + if (bp->b_bufsize) + allocbuf(bp, 0); + + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_magic = B_MAGIC_BIO; + bp->b_op = &buf_ops_bio; + + LIST_INIT(&bp->b_dep); + + /* + * If we are defragging then free the buffer. + */ + if (defrag) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + defrag = 0; + goto restart; + } + + /* + * If we are overcomitted then recover the buffer and its + * KVM space. This occurs in rare situations when multiple + * processes are blocked in getnewbuf() or allocbuf(). + */ + if (bufspace >= hibufspace) + flushingbufs = 1; + if (flushingbufs && bp->b_kvasize != 0) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; + } + if (bufspace < lobufspace) + flushingbufs = 0; + break; + } + + /* + * If we exhausted our list, sleep as appropriate. We may have to + * wakeup various daemons and write out some dirty buffers. + * + * Generally we are sleeping due to insufficient buffer space. + */ + + if (bp == NULL) { + int flags; + char *waitmsg; + + if (defrag) { + flags = VFS_BIO_NEED_BUFSPACE; + waitmsg = "nbufkv"; + } else if (bufspace >= hibufspace) { + waitmsg = "nbufbs"; + flags = VFS_BIO_NEED_BUFSPACE; + } else { + waitmsg = "newbuf"; + flags = VFS_BIO_NEED_ANY; + } + + bd_speedup(); /* heeeelp */ + + needsbuffer |= flags; + while (needsbuffer & flags) { + if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, + waitmsg, slptimeo)) + return (NULL); + } + } else { + /* + * We finally have a valid bp. We aren't quite out of the + * woods, we still have to reserve kva space. In order + * to keep fragmentation sane we only allocate kva in + * BKVASIZE chunks. + */ + maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; + + if (maxsize != bp->b_kvasize) { + vm_offset_t addr = 0; + + bfreekva(bp); + + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + /* + * Uh oh. Buffer map is to fragmented. We + * must defragment the map. + */ + ++bufdefragcnt; + defrag = 1; + bp->b_flags |= B_INVAL; + brelse(bp); + goto restart; + } + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + bufspace += bp->b_kvasize; + ++bufreusecnt; + } + } + bp->b_data = bp->b_kvabase; + } + return(bp); +} + +/* + * buf_daemon: + * + * buffer flushing daemon. 
Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. + */ + +static struct proc *bufdaemonproc; + +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) + +static void +buf_daemon() +{ + int s; + + mtx_lock(&Giant); + + /* + * This process needs to be suspended prior to shutdown sync. + */ + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, + SHUTDOWN_PRI_LAST); + + /* + * This process is allowed to take the buffer cache to the limit + */ + s = splbio(); + + for (;;) { + kthread_suspend_check(bufdaemonproc); + + bd_request = 0; + + /* + * Do the flush. Limit the amount of in-transit I/O we + * allow to build up, otherwise we would completely saturate + * the I/O system. Wakeup any waiting processes before we + * normally would so they can run in parallel with our drain. + */ + while (numdirtybuffers > lodirtybuffers) { + if (flushbufqueues() == 0) + break; + waitrunningbufspace(); + numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); + } + + /* + * Only clear bd_request if we have reached our low water + * mark. The buf_daemon normally waits 1 second and + * then incrementally flushes any dirty buffers that have + * built up, within reason. + * + * If we were unable to hit our low water mark and couldn't + * find any flushable buffers, we sleep half a second. + * Otherwise we loop immediately. + */ + if (numdirtybuffers <= lodirtybuffers) { + /* + * We reached our low water mark, reset the + * request and sleep until we are needed again. + * The sleep is just so the suspend code works. + */ + bd_request = 0; + tsleep(&bd_request, PVM, "psleep", hz); + } else { + /* + * We couldn't find any flushable dirty buffers but + * still have too many dirty buffers, we + * have to sleep and try again. (rare) + */ + tsleep(&bd_request, PVM, "qsleep", hz / 2); + } + } +} + +/* + * flushbufqueues: + * + * Try to flush a buffer in the dirty queue. We must be careful to + * free up B_INVAL buffers instead of write them, which NFS is + * particularly sensitive to. + */ + +static int +flushbufqueues(void) +{ + struct buf *bp; + int r = 0; + + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + + while (bp) { + KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); + if ((bp->b_flags & B_DELWRI) != 0 && + (bp->b_xflags & BX_BKGRDINPROG) == 0) { + if (bp->b_flags & B_INVAL) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) + panic("flushbufqueues: locked buf"); + bremfree(bp); + brelse(bp); + ++r; + break; + } + if (LIST_FIRST(&bp->b_dep) != NULL && + (bp->b_flags & B_DEFERRED) == 0 && + buf_countdeps(bp, 0)) { + TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], + bp, b_freelist); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], + bp, b_freelist); + bp->b_flags |= B_DEFERRED; + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + continue; + } + vfs_bio_awrite(bp); + ++r; + break; + } + bp = TAILQ_NEXT(bp, b_freelist); + } + return (r); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
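A short kernel-context sketch of how inmem() is meant to be used, mirroring the read-ahead test in breadn() earlier; the surrounding code is only sketched in comments.

/*
 * Illustrative fragment (kernel context assumed): the read-ahead test
 * used by breadn() above.
 */
        if (!inmem(vp, rablkno)) {
                /*
                 * Not held in a buffer and not resident in the backing VM
                 * object: getblk() the block and start an async BIO_READ.
                 */
        }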
+ */ + +int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + GIANT_REQUIRED; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0) + return 0; + + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + goto notinmem; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + goto notinmem; + } + return 1; + +notinmem: + return (0); +} + +/* + * vfs_setdirty: + * + * Sets the dirty range for a buffer based on the status of the dirty + * bits in the pages comprising the buffer. + * + * The range is limited to the size of the buffer. + * + * This routine is primarily used by NFS, but is generalized for the + * B_VMIO case. + */ +static void +vfs_setdirty(struct buf *bp) +{ + int i; + vm_object_t object; + + GIANT_REQUIRED; + /* + * Degenerate case - empty buffer + */ + + if (bp->b_bufsize == 0) + return; + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. + */ + + if ((bp->b_flags & B_VMIO) == 0) + return; + + object = bp->b_pages[0]->object; + + if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p writeable but not mightbedirty\n", object); + if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) + printf("Warning: object %p mightbedirty but not writeable\n", object); + + if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { + vm_offset_t boffset; + vm_offset_t eoffset; + + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) { + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + vm_page_test_dirty(bp->b_pages[i]); + } + + /* + * Calculate the encompassing dirty range, boffset and eoffset, + * (eoffset - boffset) bytes. + */ + + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) + break; + } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + /* + * Fit it to the buffer. + */ + + if (eoffset > bp->b_bcount) + eoffset = bp->b_bcount; + + /* + * If we have a good dirty range, merge with the existing + * dirty range. + */ + + if (boffset < eoffset) { + if (bp->b_dirtyoff > boffset) + bp->b_dirtyoff = boffset; + if (bp->b_dirtyend < eoffset) + bp->b_dirtyend = eoffset; + } + } +} + +/* + * getblk: + * + * Get a block given a specified block and offset into a file/device. + * The buffers B_DONE bit will be cleared on return, making it almost + * ready for an I/O initiation. B_INVAL may or may not be set on + * return. The caller should clear B_INVAL prior to initiating a + * READ. + * + * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for + * an existing buffer. + * + * For a VMIO buffer, B_CACHE is modified according to the backing VM. 
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set + * and then cleared based on the backing VM. If the previous buffer is + * non-0-sized but invalid, B_CACHE will be cleared. + * + * If getblk() must create a new buffer, the new buffer is returned with + * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which + * case it is returned with B_INVAL clear and B_CACHE set based on the + * backing VM. + * + * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whos + * B_CACHE bit is clear. + * + * What this means, basically, is that the caller should use B_CACHE to + * determine whether the buffer is fully valid or not and should clear + * B_INVAL prior to issuing a read. If the caller intends to validate + * the buffer by loading its data area with something, the caller needs + * to clear B_INVAL. If the caller does this without issuing an I/O, + * the caller should set B_CACHE ( as an optimization ), else the caller + * should issue the I/O and biodone() will set B_CACHE if the I/O was + * a write attempt or if it was a successfull read. If the caller + * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR + * prior to issuing the READ. biodone() will *not* clear B_INVAL. + */ +struct buf * +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) +{ + struct buf *bp; + int s; + struct bufhashhdr *bh; + + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + + s = splbio(); +loop: + /* + * Block if we are low on buffers. Certain processes are allowed + * to completely exhaust the buffer cache. + * + * If this check ever becomes a bottleneck it may be better to + * move it into the else, when gbincore() fails. At the moment + * it isn't a problem. + * + * XXX remove if 0 sections (clean this up after its proven) + */ + if (numfreebuffers == 0) { + if (curthread == PCPU_GET(idlethread)) + return NULL; + needsbuffer |= VFS_BIO_NEED_ANY; + } + + if ((bp = gbincore(vp, blkno))) { + /* + * Buffer is in-core. If the buffer is not busy, it must + * be on a queue. + */ + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, + "getblk", slpflag, slptimeo) == ENOLCK) + goto loop; + splx(s); + return (struct buf *) NULL; + } + + /* + * The buffer is locked. B_CACHE is cleared if the buffer is + * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set + * and for a VMIO buffer B_CACHE is adjusted according to the + * backing VM cache. + */ + if (bp->b_flags & B_INVAL) + bp->b_flags &= ~B_CACHE; + else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) + bp->b_flags |= B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies for non-VMIO case. + */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize)) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + } else { + if ((bp->b_flags & B_VMIO) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + } + } + goto loop; + } + } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting set or + * cleared. If the size has not changed, B_CACHE remains + * unchanged from its previous state. 
+ */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * A buffer with B_DELWRI set and B_CACHE clear must + * be committed before we can return the buffer in + * order to prevent the caller from issuing a read + * ( due to B_CACHE not being set ) and overwriting + * it. + * + * Most callers, including NFS and FFS, need this to + * operate properly either because they assume they + * can issue a read if B_CACHE is not set, or because + * ( for example ) an uncached B_DELWRI might loop due + * to softupdates re-dirtying the buffer. In the latter + * case, B_CACHE is set after the first write completes, + * preventing further loops. + * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE + * above while extending the buffer, we cannot allow the + * buffer to remain with B_CACHE set after the write + * completes or it will represent a corrupt state. To + * deal with this we set B_NOCACHE to scrap the buffer + * after the write. + * + * We might be able to do something fancy, like setting + * B_CACHE in bwrite() except if B_DELWRI is already set, + * so the below call doesn't set B_CACHE, but that gets real + * confusing. This is much easier. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + bp->b_flags |= B_NOCACHE; + BUF_WRITE(bp); + goto loop; + } + + splx(s); + bp->b_flags &= ~B_DONE; + } else { + /* + * Buffer is not in-core, create new buffer. The buffer + * returned by getnewbuf() is locked. Note that the returned + * buffer is also considered valid (not marked B_INVAL). + */ + int bsize, maxsize, vmio; + off_t offset; + + if (vn_isdisk(vp, NULL)) + bsize = DEV_BSIZE; + else if (vp->v_mountedhere) + bsize = vp->v_mountedhere->mnt_stat.f_iosize; + else if (vp->v_mount) + bsize = vp->v_mount->mnt_stat.f_iosize; + else + bsize = size; + + offset = blkno * bsize; + vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF); + maxsize = vmio ? size + (offset & PAGE_MASK) : size; + maxsize = imax(maxsize, bsize); + + if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * This can be a problem whether the vnode is locked or not. + * If the buffer is created out from under us, we have to + * throw away the one we just created. There is now window + * race because we are safely running at splbio() from the + * point of the duplicate buffer creation through to here, + * and we've locked the buffer. + */ + if (gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bp->b_offset = offset; + + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = bufhash(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + /* + * set B_VMIO bit. allocbuf() the buffer bigger. Since the + * buffer size starts out as 0, B_CACHE will be set by + * allocbuf() for the VMIO case prior to it testing the + * backing store for validity. 
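A standalone worked example (assumed 4K pages, hypothetical block numbers) of the offset/maxsize computation getblk() performs above when it has to create a new VMIO buffer.

/* Illustrative only: the KVA size-up getblk() does for a new VMIO buffer. */
#include <stdio.h>

#define EX_PAGE_MASK    (4096 - 1)      /* assumed 4K pages */

int
main(void)
{
        long bsize = 512;               /* DEV_BSIZE, e.g. a disk vnode */
        long blkno = 7, size = 2048;
        long offset = blkno * bsize;                    /* 3584 */
        long maxsize = size + (offset & EX_PAGE_MASK);  /* VMIO: 2048 + 3584 */

        if (maxsize < bsize)
                maxsize = bsize;
        /*
         * 2048 bytes of data starting 3584 bytes into a page need 5632
         * bytes of KVA from the page base, so the mapping covers both
         * pages; getnewbuf() then rounds this up to a BKVASIZE multiple.
         */
        printf("offset=%ld maxsize=%ld\n", offset, maxsize);
        return (0);
}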
+ */ + + if (vmio) { + bp->b_flags |= B_VMIO; +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + + allocbuf(bp, size); + + splx(s); + bp->b_flags &= ~B_DONE; + } + return (bp); +} + +/* + * Get an empty, disassociated buffer of given size. The buffer is initially + * set to B_INVAL. + */ +struct buf * +geteblk(int size) +{ + struct buf *bp; + int s; + int maxsize; + + maxsize = (size + BKVAMASK) & ~BKVAMASK; + + s = splbio(); + while ((bp = getnewbuf(0, 0, size, maxsize)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + return (bp); +} + + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. + * + * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with + * B_CACHE for the non-VMIO case. + */ + +int +allocbuf(struct buf *bp, int size) +{ + int newbsize, mbsize; + int i; + + GIANT_REQUIRED; + + if (BUF_REFCNT(bp) == 0) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel. Don't + * mess with B_CACHE. + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_data = bp->b_kvabase; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer + * grows. + */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufmallocspace += mbsize; + return 1; + } + origbuf = NULL; + origbufsize = 0; + /* + * If the buffer is growing on its other-than-first allocation, + * then we revert to the page-allocation scheme. 
+ */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; + bufspacewakeup(); + bp->b_bufsize = 0; + } + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (size == 0) ? 0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); + /* + * Set B_CACHE initially if buffer is 0 length or will become + * 0-length. + */ + if (size == 0 || bp->b_bufsize == 0) + bp->b_flags |= B_CACHE; + + if (newbsize < bp->b_bufsize) { + /* + * DEV_BSIZE aligned new buffer size is less then the + * DEV_BSIZE aligned existing buffer size. Figure out + * if we have to remove any pages. + */ + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of + * vnode_pager_setsize + */ + m = bp->b_pages[i]; + KASSERT(m != bogus_page, + ("allocbuf: bogus page found")); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; + + bp->b_pages[i] = NULL; + vm_page_unwire(m, 0); + } + pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (size > bp->b_bcount) { + /* + * We are growing the buffer, possibly in a + * byte-granular fashion. + */ + struct vnode *vp; + vm_object_t obj; + vm_offset_t toff; + vm_offset_t tinc; + + /* + * Step 1, bring in the VM pages from the object, + * allocating them if necessary. We must clear + * B_CACHE if these pages are not valid for the + * range covered by the buffer. + */ + + vp = bp->b_vp; + VOP_GETVOBJECT(vp, &obj); + + while (bp->b_npages < desiredpages) { + vm_page_t m; + vm_pindex_t pi; + + pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; + if ((m = vm_page_lookup(obj, pi)) == NULL) { + /* + * note: must allocate system pages + * since blocking here could intefere + * with paging I/O, no matter which + * process we are. + */ + m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM); + if (m == NULL) { + VM_WAIT; + vm_pageout_deficit += desiredpages - bp->b_npages; + } else { + vm_page_wire(m); + vm_page_wakeup(m); + bp->b_flags &= ~B_CACHE; + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; + } + continue; + } + + /* + * We found a page. If we have to sleep on it, + * retry because it might have gotten freed out + * from under us. + * + * We can only test PG_BUSY here. Blocking on + * m->busy might lead to a deadlock: + * + * vm_fault->getpages->cluster_read->allocbuf + * + */ + + if (vm_page_sleep_busy(m, FALSE, "pgtblk")) + continue; + + /* + * We have a good page. Should we wakeup the + * page daemon? + */ + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); + bp->b_pages[bp->b_npages] = m; + ++bp->b_npages; + } + + /* + * Step 2. We've loaded the pages into the buffer, + * we have to figure out if we can still have B_CACHE + * set. 
Note that B_CACHE is set according to the + * byte-granular range ( bcount and size ), new the + * aligned range ( newbsize ). + * + * The VM test is against m->valid, which is DEV_BSIZE + * aligned. Needless to say, the validity of the data + * needs to also be DEV_BSIZE aligned. Note that this + * fails with NFS if the server or some other client + * extends the file's EOF. If our buffer is resized, + * B_CACHE may remain set! XXX + */ + + toff = bp->b_bcount; + tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); + + while ((bp->b_flags & B_CACHE) && toff < size) { + vm_pindex_t pi; + + if (tinc > (size - toff)) + tinc = size - toff; + + pi = ((bp->b_offset & PAGE_MASK) + toff) >> + PAGE_SHIFT; + + vfs_buf_test_cache( + bp, + bp->b_offset, + toff, + tinc, + bp->b_pages[pi] + ); + toff += tinc; + tinc = PAGE_SIZE; + } + + /* + * Step 3, fixup the KVM pmap. Remember that + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + + bp->b_data = (caddr_t) + trunc_page((vm_offset_t)bp->b_data); + pmap_qenter( + (vm_offset_t)bp->b_data, + bp->b_pages, + bp->b_npages + ); + + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); + } + } + if (newbsize < bp->b_bufsize) + bufspacewakeup(); + bp->b_bufsize = newbsize; /* actual buffer allocation */ + bp->b_bcount = size; /* requested buffer size */ + return 1; +} + +/* + * bufwait: + * + * Wait for buffer I/O completion, returning error status. The buffer + * is left locked and B_DONE on return. B_EINTR is converted into a EINTR + * error and cleared. + */ +int +bufwait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + if (bp->b_iocmd == BIO_READ) + tsleep(bp, PRIBIO, "biord", 0); + else + tsleep(bp, PRIBIO, "biowr", 0); + } + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_ioflags & BIO_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + + /* + * Call back function from struct bio back up to struct buf. + * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY(). + */ +void +bufdonebio(struct bio *bp) +{ + bufdone(bp->bio_caller2); +} + +/* + * bufdone: + * + * Finish I/O on a buffer, optionally calling a completion function. + * This is usually called from an interrupt so process blocking is + * not allowed. + * + * biodone is also responsible for setting B_CACHE in a B_VMIO bp. + * In a non-VMIO bp, B_CACHE will be set on the next getblk() + * assuming B_INVAL is clear. + * + * For the VMIO case, we set B_CACHE if the op was a read and no + * read error occured, or if the op was a write. B_CACHE is never + * set if the buffer is invalid or otherwise uncacheable. + * + * biodone does not mess with B_INVAL, allowing the I/O routine or the + * initiator to leave B_INVAL set to brelse the buffer out of existance + * in the biodone routine. 
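A standalone model (assumed 4K pages, hypothetical sizes) of the page-bounded toff/tinc walk that allocbuf() uses above when re-testing B_CACHE after growing a buffer: each chunk stops at a page boundary so vfs_buf_test_cache() only ever examines one page at a time.

/* Illustrative only: the page-bounded walk allocbuf() performs above. */
#include <stdio.h>

#define EX_PAGE_SIZE    4096
#define EX_PAGE_SHIFT   12
#define EX_PAGE_MASK    (EX_PAGE_SIZE - 1)

int
main(void)
{
        long b_offset = 3584, b_bcount = 2048, size = 10240;    /* hypothetical */
        long toff = b_bcount;
        long tinc = EX_PAGE_SIZE - ((b_offset + toff) & EX_PAGE_MASK);

        while (toff < size) {
                long pi;

                if (tinc > size - toff)
                        tinc = size - toff;
                pi = ((b_offset & EX_PAGE_MASK) + toff) >> EX_PAGE_SHIFT;
                printf("test page %ld for bytes [%ld, %ld)\n",
                    pi, toff, toff + tinc);
                toff += tinc;
                tinc = EX_PAGE_SIZE;
        }
        return (0);
}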
+ */ +void +bufdone(struct buf *bp) +{ + int s, error; + void (*biodone)(struct buf *); + + GIANT_REQUIRED; + + s = splbio(); + + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); + KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); + + bp->b_flags |= B_DONE; + runningbufwakeup(bp); + + if (bp->b_iocmd == BIO_DELETE) { + brelse(bp); + splx(s); + return; + } + + if (bp->b_iocmd == BIO_WRITE) { + vwakeup(bp); + } + + /* call optional completion function if requested */ + if (bp->b_iodone != NULL) { + biodone = bp->b_iodone; + bp->b_iodone = NULL; + (*biodone) (bp); + splx(s); + return; + } + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_complete(bp); + + if (bp->b_flags & B_VMIO) { + int i; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + error = VOP_GETVOBJECT(vp, &obj); + +#if defined(VFS_BIO_DEBUG) + if (vp->v_usecount == 0) { + panic("biodone: zero vnode ref count"); + } + + if (error) { + panic("biodone: missing VM object"); + } + + if ((vp->v_flag & VOBJBUF) == 0) { + panic("biodone: vnode is not setup for merged cache"); + } +#endif + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("biodone: no buffer offset")); + + if (error) { + panic("biodone: no object"); + } +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + + /* + * Set B_CACHE if the op was a normal read and no error + * occured. B_CACHE is set for writes in the b*write() + * routines. + */ + iosize = bp->b_bcount - bp->b_resid; + if (bp->b_iocmd == BIO_READ && + !(bp->b_flags & (B_INVAL|B_NOCACHE)) && + !(bp->b_ioflags & BIO_ERROR)) { + bp->b_flags |= B_CACHE; + } + + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + int resid; + + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (m == NULL) + panic("biodone: page disappeared!"); + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf( +"biodone: foff(%lu)/m->pindex(%d) mismatch\n", + (unsigned long)foff, m->pindex); + } +#endif + + /* + * In the write case, the valid and clean bits are + * already changed correctly ( see bdwrite() ), so we + * only need to do this here in the read case. + */ + if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { + vfs_page_set_valid(bp, foff, i, m); + } + vm_page_flag_clear(m, PG_ZERO); + + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! 
+ */ + if (m->busy == 0) { + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); + if (!vn_isdisk(vp, NULL)) + printf(" iosize: %ld, lblkno: %jd, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (intmax_t) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %jd, flags: 0x%lx, npages: %d\n", + (intmax_t) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); + panic("biodone: page busy < 0\n"); + } + vm_page_io_finish(m); + vm_object_pip_subtract(obj, 1); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + iosize -= resid; + } + if (obj) + vm_object_pip_wakeupn(obj, 0); + } + + /* + * For asynchronous completions, release the buffer now. The brelse + * will do a wakeup there if necessary - so no need to do a wakeup + * here in the async case. The sync case always needs to do a wakeup. + */ + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) + brelse(bp); + else + bqrelse(bp); + } else { + wakeup(bp); + } + splx(s); +} + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. + */ +void +vfs_unbusy_pages(struct buf * bp) +{ + int i; + + GIANT_REQUIRED; + + runningbufwakeup(bp); + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj; + + VOP_GETVOBJECT(vp, &obj); + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + vm_object_pip_subtract(obj, 1); + vm_page_flag_clear(m, PG_ZERO); + vm_page_io_finish(m); + } + vm_object_pip_wakeupn(obj, 0); + } +} + +/* + * vfs_page_set_valid: + * + * Set the valid bits in a page based on the supplied offset. The + * range is restricted to the buffer's size. + * + * This routine is typically called after a read completes. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) +{ + vm_ooffset_t soff, eoff; + + GIANT_REQUIRED; + /* + * Start and end offsets in buffer. eoff - soff may not cross a + * page boundry or cross the end of the buffer. The end of the + * buffer, in this case, is our file EOF, not the allocation size + * of the buffer. + */ + soff = off; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. + */ + if (eoff > soff) { + vm_page_set_validclean( + m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff) + ); + } +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. + * + * Since I/O has not been initiated yet, certain buffer flags + * such as BIO_ERROR or B_INVAL may be in an inconsistant state + * and should be ignored. 
+ */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i, bogus; + + GIANT_REQUIRED; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj; + vm_ooffset_t foff; + + VOP_GETVOBJECT(vp, &obj); + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + vfs_setdirty(bp); + +retry: + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + if (vm_page_sleep_busy(m, FALSE, "vbpage")) + goto retry; + } + + bogus = 0; + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_ZERO); + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + } + + /* + * When readying a buffer for a read ( i.e + * clear_modify == 0 ), it is important to do + * bogus_page replacement for valid pages in + * partially instantiated buffers. Partially + * instantiated buffers can, in turn, occur when + * reconstituting a buffer from its VM backing store + * base. We only have to do this if B_CACHE is + * clear ( which causes the I/O to occur in the + * first place ). The replacement prevents the read + * I/O from overwriting potentially dirty VM-backed + * pages. XXX bogus page replacement is, uh, bogus. + * It may not work properly with small-block devices. + * We need to find a better way. + */ + + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) + vfs_page_set_valid(bp, foff, i, m); + else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus++; + } + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + if (bogus) + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. + * + * Note that while we only really need to clean through to b_bcount, we + * just go ahead and clean through to b_bufsize. + */ +static void +vfs_clean_pages(struct buf * bp) +{ + int i; + + GIANT_REQUIRED; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages: no buffer offset")); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + vm_ooffset_t eoff = noff; + + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + vfs_page_set_valid(bp, foff, i, m); + /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ + foff = noff; + } + } +} + +/* + * vfs_bio_set_validclean: + * + * Set the range within the buffer to valid and clean. The range is + * relative to the beginning of the buffer, b_offset. Note that b_offset + * itself may be offset from the beginning of the first page. + * + */ + +void +vfs_bio_set_validclean(struct buf *bp, int base, int size) +{ + if (bp->b_flags & B_VMIO) { + int i; + int n; + + /* + * Fixup base to be relative to beginning of first page. + * Set initial n to be the maximum number of bytes in the + * first page that can be validated. 
+ */ + + base += (bp->b_offset & PAGE_MASK); + n = PAGE_SIZE - (base & PAGE_MASK); + + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + vm_page_t m = bp->b_pages[i]; + + if (n > size) + n = size; + + vm_page_set_validclean(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + } +} + +/* + * vfs_bio_clrbuf: + * + * clear a buffer. This routine essentially fakes an I/O, so we need + * to clear BIO_ERROR and B_INVAL. + * + * Note that while we only theoretically need to clear through b_bcount, + * we go ahead and clear through b_bufsize. + */ + +void +vfs_bio_clrbuf(struct buf *bp) +{ + int i, mask = 0; + caddr_t sa, ea; + + GIANT_REQUIRED; + + if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + if ((bp->b_pages[0]->valid & mask) == mask) { + bp->b_resid = 0; + return; + } + if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && + ((bp->b_pages[0]->valid & mask) == 0)) { + bzero(bp->b_data, bp->b_bufsize); + bp->b_pages[0]->valid |= mask; + bp->b_resid = 0; + return; + } + } + ea = sa = bp->b_data; + for(i=0;i<bp->b_npages;i++,sa=ea) { + int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; + ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); + ea = (caddr_t)(vm_offset_t)ulmin( + (u_long)(vm_offset_t)ea, + (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(sa, ea - sa); + } + } else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && + (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(sa, DEV_BSIZE); + } + } + bp->b_pages[i]->valid |= mask; + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_free_pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. + */ +static void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + GIANT_REQUIRED; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { +tryagain: + /* + * note: must allocate system pages since blocking here + * could intefere with paging I/O, no matter which + * process we are. 
+ */ + p = vm_page_alloc(kernel_object, + ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_SYSTEM); + if (!p) { + vm_pageout_deficit += (to - from) >> PAGE_SHIFT; + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + p->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(p, PG_ZERO); + pmap_qenter(pg, &p, 1); + bp->b_pages[index] = p; + vm_page_wakeup(p); + } + bp->b_npages = index; +} + +/* Return pages associated with this buf to the vm system */ +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index, newnpages; + + GIANT_REQUIRED; + + from = round_page(from); + to = round_page(to); + newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf( + "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", + (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_qremove(pg, 1); + vm_page_busy(p); + vm_page_unwire(p, 0); + vm_page_free(p); + } + } + bp->b_npages = newnpages; +} + + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +/* DDB command to show buffer data */ +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; + + if (!have_addr) { + db_printf("usage: show buffer <addr>\n"); + return; + } + + db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); + db_printf( + "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" + "b_dev = (%d,%d), b_data = %p, b_blkno = %jd, b_pblkno = %jd\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + major(bp->b_dev), minor(bp->b_dev), bp->b_data, + (intmax_t)bp->b_blkno, (intmax_t)bp->b_pblkno); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, + (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } +} +#endif /* DDB */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 0000000..be79fc2 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/filedesc.h> +#include <sys/fnv_hash.h> + +/* + * This structure describes the elements in the cache of recent + * names looked up by namei. + */ + +struct namecache { + LIST_ENTRY(namecache) nc_hash; /* hash chain */ + LIST_ENTRY(namecache) nc_src; /* source vnode list */ + TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ + struct vnode *nc_dvp; /* vnode of parent of name */ + struct vnode *nc_vp; /* vnode the name refers to */ + u_char nc_flag; /* flag bits */ + u_char nc_nlen; /* length of name */ + char nc_name[0]; /* segment name */ +}; + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name cacheing. 
+ */ +#define NCHHASH(hash) \ + (&nchashtbl[(hash) & nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ +static u_long nchash; /* size of hash table */ +SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); +static u_long ncnegfactor = 16; /* ratio of negative entries */ +SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); +static u_long numneg; /* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); +static u_long numcache; /* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); +static u_long numcachehv; /* number of cache entries with vnodes held */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); +#if 0 +static u_long numcachepl; /* number of cache purge for leaf entries */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); +#endif +struct nchstats nchstats; /* cache effectiveness statistics */ + +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); + +/* Export size information to userland */ +SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); +SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); + +/* + * The new name cache statistics + */ +SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); +#define STATNODE(mode, name, var) \ + SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); +STATNODE(CTLFLAG_RD, numneg, &numneg); +STATNODE(CTLFLAG_RD, numcache, &numcache); +static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); +static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); +static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); +static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); +static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); +static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); +static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); +static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); +static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); +static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); + +SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats, + sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); + + + +static void cache_zap(struct namecache *ncp); + +static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 1 + +/* + * Grab an atomic snapshot of the name cache hash chain lengths + */ +SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); + +static int +sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count; + + n_nchash = nchash + 1; /* nchash is max index, not count */ + if (!req->oldptr) + return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + error = SYSCTL_OUT(req, &count, sizeof(count)); + if (error) + return (error); + } + return (0); +} +SYSCTL_PROC(_debug_hashstat, 
OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD, + 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); + +static int +sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count, maxlength, used, pct; + + if (!req->oldptr) + return SYSCTL_OUT(req, 0, 4 * sizeof(int)); + + n_nchash = nchash + 1; /* nchash is max index, not count */ + used = 0; + maxlength = 0; + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + if (count) + used++; + if (maxlength < count) + maxlength = count; + } + n_nchash = nchash + 1; + pct = (used * 100 * 100) / n_nchash; + error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); + if (error) + return (error); + error = SYSCTL_OUT(req, &used, sizeof(used)); + if (error) + return (error); + error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); + if (error) + return (error); + error = SYSCTL_OUT(req, &pct, sizeof(pct)); + if (error) + return (error); + return (0); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD, + 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths"); + +/* + * Delete an entry from its hash list and move it to the front + * of the LRU list for immediate reuse. + */ +static void +cache_zap(ncp) + struct namecache *ncp; +{ + LIST_REMOVE(ncp, nc_hash); + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { + vdrop(ncp->nc_dvp); + numcachehv--; + } + if (ncp->nc_vp) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + numneg--; + } + numcache--; + free(ncp, M_VFSCACHE); +} + +/* + * cache_leaf_test() + * + * Test whether this (directory) vnode's namei cache entry contains + * subdirectories or not. Used to determine whether the directory is + * a leaf in the namei cache or not. Note: the directory may still + * contain files in the namei cache. + * + * Returns 0 if the directory is a leaf, -1 if it isn't. + */ +int +cache_leaf_test(struct vnode *vp) +{ + struct namecache *ncpc; + + for (ncpc = LIST_FIRST(&vp->v_cache_src); + ncpc != NULL; + ncpc = LIST_NEXT(ncpc, nc_src) + ) { + if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) + return(-1); + } + return(0); +} + +/* + * Lookup an entry in the cache + * + * Lookup is called with dvp pointing to the directory to search, + * cnp pointing to the name of the entry being sought. If the lookup + * succeeds, the vnode is returned in *vpp, and a status of -1 is + * returned. If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. If the lookup + * fails, a status of zero is returned. 
+ */ + +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + struct namecache *ncp; + u_int32_t hash; + + if (!doingcache) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + numcalls++; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + *vpp = dvp; + dothits++; + return (-1); + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + dotdothits++; + if (dvp->v_dd->v_id != dvp->v_ddid || + (cnp->cn_flags & MAKEENTRY) == 0) { + dvp->v_ddid = 0; + return (0); + } + *vpp = dvp->v_dd; + return (-1); + } + } + + hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); + hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash); + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + numchecks++; + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == 0) { + if ((cnp->cn_flags & MAKEENTRY) == 0) { + nummisszap++; + } else { + nummiss++; + } + nchstats.ncs_miss++; + return (0); + } + + /* We don't want to have an entry, so dump it */ + if ((cnp->cn_flags & MAKEENTRY) == 0) { + numposzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (ncp->nc_vp) { + numposhits++; + nchstats.ncs_goodhits++; + *vpp = ncp->nc_vp; + return (-1); + } + + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + numnegzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + numneghits++; + /* + * We found a "negative" match, ENOENT notifies client of this match. + * The nc_vpid field records whether this is a whiteout. + */ + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + nchstats.ncs_neghits++; + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + return (ENOENT); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + struct namecache *ncp; + struct nchashhead *ncpp; + u_int32_t hash; + int len; + + if (!doingcache) + return; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + return; + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + if (vp) { + dvp->v_dd = vp; + dvp->v_ddid = vp->v_id; + } else { + dvp->v_dd = dvp; + dvp->v_ddid = 0; + } + return; + } + } + + ncp = (struct namecache *) + malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + if (!vp) { + numneg++; + ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; + } else if (vp->v_type == VDIR) { + vp->v_dd = dvp; + vp->v_ddid = dvp->v_id; + } + + /* + * Fill in cache info, if vp is NULL this is a "negative" cache entry. + * For negative entries, we have to record whether it is a whiteout. + * the whiteout flag is stored in the nc_vpid field which is + * otherwise unused. 
+ */ + ncp->nc_vp = vp; + ncp->nc_dvp = dvp; + len = ncp->nc_nlen = cnp->cn_namelen; + hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); + bcopy(cnp->cn_nameptr, ncp->nc_name, len); + hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash); + ncpp = NCHHASH(hash); + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + if (LIST_EMPTY(&dvp->v_cache_src)) { + vhold(dvp); + numcachehv++; + } + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + if (vp) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + } + if (numneg * ncnegfactor > numcache) { + ncp = TAILQ_FIRST(&ncneg); + cache_zap(ncp); + } +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +static void +nchinit(void *dummy __unused) +{ + + TAILQ_INIT(&ncneg); + nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL) + + +/* + * Invalidate all entries to a particular vnode. + * + * Remove all entries in the namecache relating to this vnode and + * change the v_id. We take the v_id from a global counter, since + * it becomes a handy sequence number in crash-dumps that way. + * No valid vnode will ever have (v_id == 0). + * + * XXX: Only time and the size of v_id prevents this from failing: + * XXX: In theory we should hunt down all (struct vnode*, v_id) + * XXX: soft references and nuke them, at least on the global + * XXX: v_id wraparound. The period of resistance can be extended + * XXX: by incrementing each vnodes v_id individually instead of + * XXX: using the global v_id. + */ + +void +cache_purge(vp) + struct vnode *vp; +{ + static u_long nextid; + + while (!LIST_EMPTY(&vp->v_cache_src)) + cache_zap(LIST_FIRST(&vp->v_cache_src)); + while (!TAILQ_EMPTY(&vp->v_cache_dst)) + cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); + + do + nextid++; + while (nextid == vp->v_id || !nextid); + vp->v_id = nextid; + vp->v_dd = vp; + vp->v_ddid = 0; +} + +/* + * Flush all entries referencing a particular filesystem. + * + * Since we need to check it anyway, we will flush all the invalid + * entries at the same time. + */ +void +cache_purgevfs(mp) + struct mount *mp; +{ + struct nchashhead *ncpp; + struct namecache *ncp, *nnp; + + /* Scan hash tables for applicable entries */ + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) { + nnp = LIST_NEXT(ncp, nc_hash); + if (ncp->nc_dvp->v_mount == mp) { + cache_zap(ncp); + } + } + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. 
+ */ + +int +vfs_cache_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp, *vp; + int lockparent; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + struct thread *td = cnp->cn_thread; + u_long vpid; /* capability number of vnode */ + + *vpp = NULL; + dvp = ap->a_dvp; + lockparent = flags & LOCKPARENT; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = VOP_ACCESS(dvp, VEXEC, cred, td); + + if (error) + return (error); + + error = cache_lookup(dvp, vpp, cnp); + +#ifdef LOOKUP_SHARED + if (!error) { + /* We do this because the rest of the system now expects to get + * a shared lock, which is later upgraded if LOCKSHARED is not + * set. We have so many cases here because of bugs that yield + * inconsistant lock states. This all badly needs to be fixed + */ + error = VOP_CACHEDLOOKUP(dvp, vpp, cnp); + if (!error) { + int flock; + + flock = VOP_ISLOCKED(*vpp, td); + if (flock != LK_EXCLUSIVE) { + if (flock == 0) { + if ((flags & ISLASTCN) && + (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_SHARED, td); + else + VOP_LOCK(*vpp, LK_EXCLUSIVE, td); + } + } else if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_DOWNGRADE, td); + } + return (error); + } +#else + if (!error) + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); +#endif + + if (error == ENOENT) + return (error); + + vp = *vpp; + vpid = vp->v_id; + cnp->cn_flags &= ~PDIRUNLOCK; + if (dvp == vp) { /* lookup on "." */ + VREF(vp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; +#ifdef LOOKUP_SHARED + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + error = vget(vp, LK_SHARED, td); + else + error = vget(vp, LK_EXCLUSIVE, td); +#else + error = vget(vp, LK_EXCLUSIVE, td); +#endif + + if (!error && lockparent && (flags & ISLASTCN)) { + if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0) + cnp->cn_flags &= ~PDIRUNLOCK; + } + } else { +#ifdef LOOKUP_SHARED + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + error = vget(vp, LK_SHARED, td); + else + error = vget(vp, LK_EXCLUSIVE, td); +#else + error = vget(vp, LK_EXCLUSIVE, td); +#endif + if (!lockparent || error || !(flags & ISLASTCN)) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. 
+ */ + if (!error) { + if (vpid == vp->v_id) + return (0); + vput(vp); + if (lockparent && dvp != vp && (flags & ISLASTCN)) { + VOP_UNLOCK(dvp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + } + if (cnp->cn_flags & PDIRUNLOCK) { + error = vn_lock(dvp, LK_EXCLUSIVE, td); + if (error) + return (error); + cnp->cn_flags &= ~PDIRUNLOCK; + } +#ifdef LOOKUP_SHARED + error = VOP_CACHEDLOOKUP(dvp, vpp, cnp); + + if (!error) { + int flock = 0; + + flock = VOP_ISLOCKED(*vpp, td); + if (flock != LK_EXCLUSIVE) { + if (flock == 0) { + if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_SHARED, td); + else + VOP_LOCK(*vpp, LK_EXCLUSIVE, td); + } + } else if ((flags & ISLASTCN) && (flags & LOCKSHARED)) + VOP_LOCK(*vpp, LK_DOWNGRADE, td); + } + + return (error); +#else + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); +#endif +} + + +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif + +/* + * XXX All of these sysctls would probably be more productive dead. + */ +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, + "Disable the getcwd syscall"); + +/* Various statistics for the getcwd syscall */ +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); + +/* Implementation of the getcwd syscall */ +int +__getcwd(td, uap) + struct thread *td; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = td->td_proc->p_fd; + slash_prefixed = 0; + FILEDESC_LOCK(fdp); + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + if (vp->v_mount == NULL) { /* forced unmount */ + FILEDESC_UNLOCK(fdp); + free(buf, M_TEMP); + return (EBADF); + } + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + FILEDESC_UNLOCK(fdp); + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + FILEDESC_UNLOCK(fdp); + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + FILEDESC_UNLOCK(fdp); + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + FILEDESC_UNLOCK(fdp); + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} + +/* + * Thus begins the fullpath magic. 
+ */ + +#undef STATNODE +#define STATNODE(name) \ + static u_int name; \ + SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "") + +static int disablefullpath; +SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, + "Disable the vn_fullpath function"); + +STATNODE(numfullpathcalls); +STATNODE(numfullpathfail1); +STATNODE(numfullpathfail2); +STATNODE(numfullpathfail3); +STATNODE(numfullpathfail4); +STATNODE(numfullpathfound); + +/* + * Retrieve the full filesystem path that correspond to a vnode from the name + * cache (if available) + */ +int +vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) +{ + char *bp, *buf; + int i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numfullpathcalls++; + if (disablefullpath) + return (ENODEV); + if (vn == NULL) + return (EINVAL); + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + bp = buf + MAXPATHLEN - 1; + *bp = '\0'; + fdp = td->td_proc->p_fd; + slash_prefixed = 0; + FILEDESC_LOCK(fdp); + for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + if (vp->v_mount == NULL) { /* forced unmount */ + FILEDESC_UNLOCK(fdp); + free(buf, M_TEMP); + return (EBADF); + } + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp != vn && vp->v_dd->v_id != vp->v_ddid) { + FILEDESC_UNLOCK(fdp); + numfullpathfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + FILEDESC_UNLOCK(fdp); + numfullpathfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (vp != vn && ncp->nc_dvp != vp->v_dd) { + FILEDESC_UNLOCK(fdp); + numfullpathfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = ncp->nc_dvp; + } + if (!slash_prefixed) { + if (bp == buf) { + FILEDESC_UNLOCK(fdp); + numfullpathfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + FILEDESC_UNLOCK(fdp); + numfullpathfound++; + *retbuf = bp; + *freebuf = buf; + return (0); +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 0000000..4c11952 --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,1008 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $FreeBSD$ + */ + +#include "opt_debug_cluster.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/stdint.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <sys/sysctl.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, + "Debug VFS clustering code"); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); + +static struct cluster_save * + cluster_collectbufs(struct vnode *vp, struct buf *last_bp); +static struct buf * + cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp); + +static int write_behind = 1; +SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, + "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); + +/* Page expended to mark partially backed buffers */ +extern vm_page_t bogus_page; + +/* + * Number of physical bufs (pbufs) this subsystem is allowed. + * Manipulated by vm_pager.c + */ +extern int cluster_pbuf_freecnt; + +/* + * Maximum number of blocks for read-ahead. + */ +#define MAXRA 32 + +/* + * Read data to a buf, including read-ahead if we find this to be beneficial. + * cluster_read replaces bread. + */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + long totread; + int seqcount; + struct buf **bpp; +{ + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; + + error = 0; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_mount->mnt_iosize_max / size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; + + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. 
+ */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; + /* + * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. + */ + s = splbio(); + for (i = 1; i < maxra; i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know + * to check again. + */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + } + splx(s); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + } else { + off_t firstread = bp->b_offset; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += (bp->b_bufsize / size); + } else { +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_RAM; + bp->b_iocmd = BIO_READ; + lblkno += 1; + } + } + + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. 
+ */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_ASYNC | B_RAM; + rbp->b_iocmd = BIO_READ; + rbp->b_blkno = blkno; + } + } + } + + /* + * handle the synchronous read + */ + if (bp) { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%ld,%ld,%d) ", + (long)bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(bp, 0); + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) + BUF_KERNPROC(bp); + error = VOP_STRATEGY(vp, bp); + curproc->p_stats->p_ru.ru_inblock++; + } + + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { + rbp->b_flags &= ~B_ASYNC; + brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~B_ASYNC; + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+"); + else + printf("A"); + printf("(%lld,%ld,%lld,%d) ", + (intmax_t)rbp->b_lblkno, rbp->b_bcount, + (intmax_t)(rbp->b_lblkno - origblkno), + seqcount); + } +#endif + + if ((rbp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(rbp, 0); + } + rbp->b_flags &= ~B_INVAL; + rbp->b_ioflags &= ~BIO_ERROR; + if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) + BUF_KERNPROC(rbp); + (void) VOP_STRATEGY(vp, rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (bufwait(reqbp)); + else + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. + */ +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lbn; + daddr_t blkno; + long size; + int run; + struct buf *fbp; +{ + struct buf *bp, *tbp; + daddr_t bn; + int i, inc, j; + + GIANT_REQUIRED; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != filesize %ld\n", + size, vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_iocmd = BIO_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_RAM; + tbp->b_iocmd = BIO_READ; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == 0) + return tbp; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. 
+ */ + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + bp->b_iocmd = BIO_READ; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_mount->mnt_iosize_max) { + break; + } + + /* + * Shortcut some checks and try to avoid buffers that + * would block in the lock. The same checks have to + * be made again after we officially get the buffer. + */ + if ((tbp = incore(vp, lbn + i)) != NULL) { + if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) + break; + BUF_UNLOCK(tbp); + + for (j = 0; j < tbp->b_npages; j++) { + if (tbp->b_pages[j]->valid) + break; + } + + if (j != tbp->b_npages) + break; + + if (tbp->b_bcount != size) + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0); + + /* + * Stop scanning if the buffer is fully valid + * (marked B_CACHE), or locked (may be doing a + * background write), or if the buffer is not + * VMIO backed. The clustering code can only deal + * with VMIO-backed buffers. + */ + if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + /* + * The buffer must be completely invalid in order to + * take part in the cluster. If it is partially valid + * then we stop. + */ + for (j = 0;j < tbp->b_npages; j++) { + if (tbp->b_pages[j]->valid) + break; + } + if (j != tbp->b_npages) { + bqrelse(tbp); + break; + } + + /* + * Set a read-ahead mark as appropriate + */ + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + + /* + * Set the buffer up for an async read (XXX should + * we do this only if we do not wind up brelse()ing?). + * Set the block number if it isn't set, otherwise + * if it is make sure it matches the block number we + * expect. + */ + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_READ; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } + } + /* + * XXX fbp from caller may not be B_ASYNC, but we are going + * to biodone() it in cluster_callback() anyway + */ + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + /* + * XXX shouldn't this be += size for both, like in + * cluster_wbuild()? + * + * Don't inherit tbp->b_bufsize as it may be larger due to + * a non-page-aligned size. Instead just aggregate using + * 'size'. + */ + if (tbp->b_bcount != size) + printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); + if (tbp->b_bufsize != size) + printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); + bp->b_bcount += size; + bp->b_bufsize += size; + } + + /* + * Fully valid pages in the cluster are already good and do not need + * to be re-read from disk. 
Replace the page with bogus_page + */ + for (j = 0; j < bp->b_npages; j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) { + bp->b_pages[j] = bogus_page; + } + } + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. + */ +void +cluster_callback(bp) + struct buf *bp; +{ + struct buf *nbp, *tbp; + int error = 0; + + GIANT_REQUIRED; + + /* + * Must propogate errors to all the components. + */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_ioflags |= BIO_ERROR; + tbp->b_error = error; + } else { + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + tbp->b_flags &= ~B_INVAL; + tbp->b_ioflags &= ~BIO_ERROR; + /* + * XXX the bdwrite()/bqrelse() issued during + * cluster building clears B_RELBUF (see bqrelse() + * comment). If direct I/O was specified, we have + * to restore it here to allow the buffer and VM + * to be freed. + */ + if (tbp->b_flags & B_DIRECT) + tbp->b_flags |= B_RELBUF; + } + bufdone(tbp); + } + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * cluster_wbuild_wb: + * + * Implement modified write build for cluster. + * + * write_behind = 0 write behind disabled + * write_behind = 1 write behind normal (default) + * write_behind = 2 write behind backed-off + */ + +static __inline int +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) +{ + int r = 0; + + switch(write_behind) { + case 2: + if (start_lbn < len) + break; + start_lbn -= len; + /* fall through */ + case 1: + r = cluster_wbuild(vp, size, start_lbn, len); + /* fall through */ + default: + /* fall through */ + break; + } + return(r); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(bp, filesize, seqcount) + struct buf *bp; + u_quad_t filesize; + int seqcount; +{ + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + vp = bp->b_vp; + if (vp->v_type == VREG) { + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. 
+ * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + * + * Change to algorithm: only push previous cluster if + * it was sequential from the point of view of the + * seqcount heuristic, otherwise leave the buffer + * intact so we can potentially optimize the I/O + * later on in the buf_daemon or update daemon + * flush. + */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async && seqcount > 0) { + cluster_wbuild_wb(vp, lblocksize, + vp->v_cstart, cursize); + } + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster + * if *really* writing sequentially + * in the logical file (seqcount > 1), + * otherwise delay it in the hopes that + * the low level disk driver can + * optimize the write ordering. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + if (seqcount > 1) { + cluster_wbuild_wb(vp, + lblocksize, vp->v_cstart, + cursize); + } + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. + */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out if seqcount tells us we + * are operating sequentially, otherwise let the buf or + * update daemon handle it. + */ + bdwrite(bp); + if (seqcount > 1) + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); + } else { + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + } + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(vp, size, start_lbn, len) + struct vnode *vp; + long size; + daddr_t start_lbn; + int len; +{ + struct buf *bp, *tbp; + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + + GIANT_REQUIRED; + + while (len > 0) { + s = splbio(); + /* + * If the buffer is not delayed-write (i.e. 
dirty), or it + * is delayed-write but either locked or inval, it cannot + * partake in the clustered write. + */ + if (((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) || + BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { + ++start_lbn; + --len; + splx(s); + continue; + } + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + splx(s); + + /* + * Extra memory in the buffer, punt on this buffer. + * XXX we could handle this in most cases, but we would + * have to push the extra memory down to after our max + * possible cluster size and then potentially pull it back + * up if the cluster was terminated prematurely--too much + * hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != + (B_CLUSTEROK | B_VMIO)) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_magic = tbp->b_magic; + bp->b_op = tbp->b_op; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) + bp->b_wcred = crhold(tbp->b_wcred); + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags |= B_CLUSTER | + (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + s = splbio(); + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } + + /* + * If it IS in core, but has different + * characteristics, or is locked (which + * means it could be undergoing a background + * I/O or be in a weird state), then don't + * cluster with it. + */ + if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | + B_INVAL | B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || + (tbp->b_flags & B_LOCKED) || + tbp->b_wcred != bp->b_wcred || + BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { + splx(s); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { + BUF_UNLOCK(tbp); + splx(s); + break; + } + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. + */ + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + splx(s); + } /* end of code for non-first buffers only */ + /* check for latent dependencies to be handled */ + if ((LIST_FIRST(&tbp->b_dep)) != NULL) + buf_start(tbp); + /* + * If the IO is via the VM then we do some + * special VM hackery (yuck). 
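/*
 * Editor's sketch (illustrative, not from the original source): the scan
 * above admits buffer number i into the cluster only when it is
 * physically contiguous with the cluster built so far and the combined
 * page count still fits within the mount's maximum I/O size.  The same
 * admission test over plain integers (dbsize is the buffer size in
 * DEV_BSIZE blocks, as computed by btodb(size) in cluster_wbuild()):
 */
#include <stdbool.h>

static bool
fits_in_cluster(long cluster_blkno, int i, long dbsize, long buf_blkno,
    int cluster_pages, int buf_pages, int max_iosize, int page_size)
{
	if (buf_blkno != cluster_blkno + dbsize * i)
		return (false);		/* not physically contiguous */
	if (cluster_pages + buf_pages > max_iosize / page_size)
		return (false);		/* would exceed mnt_iosize_max */
	return (true);
}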
Since the buffer's + * block size may not be page-aligned it is possible + * for a page to be shared between two buffers. We + * have to get rid of the duplication when building + * the cluster. + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + if (i != 0) { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (m->flags & PG_BUSY) { + bqrelse(tbp); + goto finishcluster; + } + } + } + + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + s = splbio(); + bundirty(tbp); + tbp->b_flags &= ~B_DONE; + tbp->b_ioflags &= ~BIO_ERROR; + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_WRITE; + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(vp, last_bp) + struct vnode *vp; + struct buf *last_bp; +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, len; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c new file mode 100644 index 0000000..20d9b90 --- /dev/null +++ b/sys/kern/vfs_conf.c @@ -0,0 +1,396 @@ +/*- + * Copyright (c) 1999 Michael Smith + * All rights reserved. + * Copyright (c) 1999 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Locate and mount the root filesystem. + * + * The root filesystem is detailed in the kernel environment variable + * vfs.root.mountfrom, which is expected to be in the general format + * + * <vfsname>:[<path>] + * vfsname := the name of a VFS known to the kernel and capable + * of being mounted as root + * path := disk device name or other data used by the filesystem + * to locate its physical store + * + */ + +#include "opt_rootdevname.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/reboot.h> +#include <sys/diskslice.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/proc.h> + +#include "opt_ddb.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#include <paths.h> + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); + +#define ROOTNAME "root_device" + +/* + * The vnode of the system's root (/ in the filesystem, without chroot + * active.) + */ +struct vnode *rootvnode; + +/* + * The root specifiers we will try if RB_CDROM is specified. + */ +static char *cdrom_rootdevnames[] = { + "cd9660:cd0a", + "cd9660:acd0a", + "cd9660:wcd0a", + NULL +}; + +static int vfs_mountroot_try(char *mountfrom); +static int vfs_mountroot_ask(void); +static void gets(char *cp); + +/* legacy find-root code */ +char *rootdevnames[2] = {NULL, NULL}; +static int setrootbyname(char *name); +dev_t rootdev = NODEV; + +/* + * Find and mount the root filesystem + */ +void +vfs_mountroot(void *foo __unused) +{ + char *cp; + int i, error; + + /* + * The root filesystem information is compiled in, and we are + * booted with instructions to use it. + */ +#ifdef ROOTDEVNAME + if ((boothowto & RB_DFLTROOT) && + !vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + /* + * We are booted with instructions to prompt for the root filesystem, + * or to use the compiled-in default when it doesn't exist. + */ + if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) { + if (!vfs_mountroot_ask()) + return; + } + + /* + * We've been given the generic "use CDROM as root" flag. This is + * necessary because one media may be used in many different + * devices, so we need to search for them. + */ + if (boothowto & RB_CDROM) { + for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { + if (!vfs_mountroot_try(cdrom_rootdevnames[i])) + return; + } + } + + /* + * Try to use the value read by the loader from /etc/fstab, or + * supplied via some other means. This is the preferred + * mechanism. + */ + if ((cp = getenv("vfs.root.mountfrom")) != NULL) { + error = vfs_mountroot_try(cp); + freeenv(cp); + if (!error) + return; + } + + /* + * Try values that may have been computed by the machine-dependant + * legacy code. 
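/*
 * Editor's sketch (illustrative only): vfs_mountroot_try(), defined
 * below, splits a root specifier of the form "<vfsname>:<path>"
 * (e.g. "ufs:da0s1a") with a scanf pattern it builds at run time.  The
 * standalone program below performs the same parse outside the kernel;
 * the 15/127 field widths are stand-ins for MFSNAMELEN/MNAMELEN-sized
 * buffers and are assumptions, not values taken from this file.
 */
#include <stdio.h>

int
main(void)
{
	char patt[32], vfsname[16], path[128];

	/* widths are one less than the buffers to leave room for the NUL */
	snprintf(patt, sizeof(patt), "%%%d[a-z0-9]:%%%ds", 15, 127);
	vfsname[0] = path[0] = '\0';
	if (sscanf("ufs:da0s1a", patt, vfsname, path) >= 1)
		printf("vfs=%s path=%s\n", vfsname, path);
	return (0);
}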
+ */ + if (!vfs_mountroot_try(rootdevnames[0])) + return; + if (!vfs_mountroot_try(rootdevnames[1])) + return; + + /* + * If we have a compiled-in default, and haven't already tried it, try + * it now. + */ +#ifdef ROOTDEVNAME + if (!(boothowto & RB_DFLTROOT)) + if (!vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + + /* + * Everything so far has failed, prompt on the console if we haven't + * already tried that. + */ + if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask()) + return; + panic("Root mount failed, startup aborted."); +} + +/* + * Mount (mountfrom) as the root filesystem. + */ +static int +vfs_mountroot_try(char *mountfrom) +{ + struct mount *mp; + char *vfsname, *path; + int error; + char patt[32]; + int s; + + vfsname = NULL; + path = NULL; + mp = NULL; + error = EINVAL; + + if (mountfrom == NULL) + return(error); /* don't complain */ + + s = splcam(); /* Overkill, but annoying without it */ + printf("Mounting root from %s\n", mountfrom); + splx(s); + + /* parse vfs name and path */ + vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); + path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); + vfsname[0] = path[0] = 0; + sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); + if (sscanf(mountfrom, patt, vfsname, path) < 1) + goto done; + + /* allocate a root mount */ + error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME, + &mp); + if (error != 0) { + printf("Can't allocate root mount for filesystem '%s': %d\n", + vfsname, error); + goto done; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* do our best to set rootdev */ + if ((path[0] != 0) && setrootbyname(path)) + printf("setrootbyname failed\n"); + + /* If the root device is a type "memory disk", mount RW */ + if (rootdev != NODEV && devsw(rootdev) && + (devsw(rootdev)->d_flags & D_MEMDISK)) + mp->mnt_flag &= ~MNT_RDONLY; + + /* + * Set the mount path to be something useful, because the + * filesystem code isn't responsible now for initialising + * f_mntonname unless they want to override the default + * (which is `path'.) + */ + strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN); + + error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread); + +done: + if (vfsname != NULL) + free(vfsname, M_MOUNT); + if (path != NULL) + free(path, M_MOUNT); + if (error != 0) { + if (mp != NULL) { + vfs_unbusy(mp, curthread); + free(mp, M_MOUNT); + } + printf("Root mount failed: %d\n", error); + } else { + + /* register with list of mounted filesystems */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + + /* sanity check system clock against root filesystem timestamp */ + inittodr(mp->mnt_time); + vfs_unbusy(mp, curthread); + } + return(error); +} + +/* + * Spin prompting on the console for a suitable root filesystem + */ +static int +vfs_mountroot_ask(void) +{ + char name[128]; + int i; + dev_t dev; + + for(;;) { + printf("\nManual root filesystem specification:\n"); + printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n"); +#if defined(__i386__) || defined(__ia64__) + printf(" eg. ufs:da0s1a\n"); +#else + printf(" eg. ufs:da0a\n"); +#endif + printf(" ? 
List valid disk boot devices\n"); + printf(" <empty line> Abort manual input\n"); + printf("\nmountroot> "); + gets(name); + if (name[0] == 0) + return(1); + if (name[0] == '?') { + printf("Possibly valid devices for 'ufs' root:\n"); + for (i = 0; i < NUMCDEVSW; i++) { + dev = makedev(i, 0); + if (devsw(dev) != NULL) + printf(" \"%s\"", devsw(dev)->d_name); + } + printf("\n"); + continue; + } + if (!vfs_mountroot_try(name)) + return(0); + } +} + +/* + * Local helper function for vfs_mountroot_ask. + */ +static void +gets(char *cp) +{ + char *lp; + int c; + + lp = cp; + for (;;) { + printf("%c", c = cngetc() & 0177); + switch (c) { + case -1: + case '\n': + case '\r': + *lp++ = '\0'; + return; + case '\b': + case '\177': + if (lp > cp) { + printf(" \b"); + lp--; + } + continue; + case '#': + lp--; + if (lp < cp) + lp = cp; + continue; + case '@': + case 'u' & 037: + lp = cp; + printf("%c", '\n'); + continue; + default: + *lp++ = c; + } + } +} + +/* + * Convert a given name to the dev_t of the disk-like device + * it refers to. + */ +dev_t +getdiskbyname(char *name) { + char *cp; + dev_t dev; + + cp = name; + if (!bcmp(cp, "/dev/", 5)) + cp += 5; + + dev = NODEV; + EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev); + return (dev); +} + +/* + * Set rootdev to match (name), given that we expect it to + * refer to a disk-like device. + */ +static int +setrootbyname(char *name) +{ + dev_t diskdev; + + diskdev = getdiskbyname(name); + if (diskdev != NODEV) { + rootdev = diskdev; + return (0); + } + + return (1); +} + +/* Show the dev_t for a disk specified by name */ +#ifdef DDB +DB_SHOW_COMMAND(disk, db_getdiskbyname) +{ + dev_t dev; + + if (modif[0] == '\0') { + db_error("usage: show disk/devicename"); + return; + } + dev = getdiskbyname(modif); + if (dev != NODEV) + db_printf("dev_t = %p\n", dev); + else + db_printf("No disk device matched.\n"); +} +#endif diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c new file mode 100644 index 0000000..6bfe085 --- /dev/null +++ b/sys/kern/vfs_default.c @@ -0,0 +1,845 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/poll.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> + +static int vop_nolookup(struct vop_lookup_args *); +static int vop_nostrategy(struct vop_strategy_args *); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + */ + +vop_t **default_vnodeop_p; +static struct vnodeopv_entry_desc default_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_advlock_desc, (vop_t *) vop_einval }, + { &vop_bmap_desc, (vop_t *) vop_stdbmap }, + { &vop_close_desc, (vop_t *) vop_null }, + { &vop_createvobject_desc, (vop_t *) vop_stdcreatevobject }, + { &vop_destroyvobject_desc, (vop_t *) vop_stddestroyvobject }, + { &vop_fsync_desc, (vop_t *) vop_null }, + { &vop_getpages_desc, (vop_t *) vop_stdgetpages }, + { &vop_getvobject_desc, (vop_t *) vop_stdgetvobject }, + { &vop_inactive_desc, (vop_t *) vop_stdinactive }, + { &vop_ioctl_desc, (vop_t *) vop_enotty }, + { &vop_islocked_desc, (vop_t *) vop_noislocked }, + { &vop_lease_desc, (vop_t *) vop_null }, + { &vop_lock_desc, (vop_t *) vop_nolock }, + { &vop_lookup_desc, (vop_t *) vop_nolookup }, + { &vop_open_desc, (vop_t *) vop_null }, + { &vop_pathconf_desc, (vop_t *) vop_einval }, + { &vop_putpages_desc, (vop_t *) vop_stdputpages }, + { &vop_poll_desc, (vop_t *) vop_nopoll }, + { &vop_readlink_desc, (vop_t *) vop_einval }, + { &vop_revoke_desc, (vop_t *) vop_revoke }, + { &vop_strategy_desc, (vop_t *) vop_nostrategy }, + { &vop_unlock_desc, (vop_t *) vop_nounlock }, + { NULL, NULL } +}; + +static struct vnodeopv_desc default_vnodeop_opv_desc = + { &default_vnodeop_p, default_vnodeop_entries }; + +VNODEOP_SET(default_vnodeop_opv_desc); + +/* + * Series of placeholder functions for various error returns for + * VOPs. + */ + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +/* + * Used to make a defined VOP fall back to the default VOP. 
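/*
 * Editor's sketch (hypothetical filesystem, not part of this commit): a
 * filesystem normally registers its own vnodeopv_entry_desc table and
 * routes every operation it does not implement through vop_defaultop
 * (defined just below), which in turn dispatches into the
 * default_vnodeop_p table above.  The myfs_* names are placeholders.
 */
static int myfs_lookup(struct vop_lookup_args *ap);
static int myfs_strategy(struct vop_strategy_args *ap);

vop_t **myfs_vnodeop_p;
static struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_defaultop },
	{ &vop_lookup_desc,	(vop_t *) myfs_lookup },
	{ &vop_strategy_desc,	(vop_t *) myfs_strategy },
	{ NULL, NULL }
};
static struct vnodeopv_desc myfs_vnodeop_opv_desc =
	{ &myfs_vnodeop_p, myfs_vnodeop_entries };

VNODEOP_SET(myfs_vnodeop_opv_desc);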
+ */ +int +vop_defaultop(struct vop_generic_args *ap) +{ + + return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap)); +} + +/* + * Helper function to panic on some bad VOPs in some filesystems. + */ +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); +} + +/* + * vop_std<something> and vop_no<something> are default functions for use by + * filesystems that need the "default reasonable" implementation for a + * particular operation. + * + * The documentation for the operations they implement exists (if it exists) + * in the VOP_<SOMETHING>(9) manpage (all uppercase). + */ + +/* + * Default vop for filesystems that do not support name lookup + */ +static int +vop_nolookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * vop_nostrategy: + * + * Strategy routine for VFS devices that have none. + * + * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy + * routine. Typically this is done for a BIO_READ strategy call. + * Typically B_INVAL is assumed to already be clear prior to a write + * and should not be cleared manually unless you just made the buffer + * invalid. BIO_ERROR should be cleared either way. + */ + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vprint("", ap->a_vp); + vprint("", ap->a_bp->b_vp); + ap->a_bp->b_ioflags |= BIO_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + bufdone(ap->a_bp); + return (EOPNOTSUPP); +} + +/* + * vop_stdpathconf: + * + * Standard implementation of POSIX pathconf, to get information about limits + * for a filesystem. + * Override per filesystem for the case where the filesystem has smaller + * limits. + */ +int +vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + */ +int +vop_stdlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + +#ifndef DEBUG_LOCKS + return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, ap->a_td)); +#else + return (debuglockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock, + ap->a_td, "vop_stdlock", vp->filename, vp->line)); +#endif +} + +/* See above. */ +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE, &vp->v_interlock, + ap->a_td)); +} + +/* See above. 
*/ +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + return (lockstatus(&ap->a_vp->v_lock, ap->a_td)); +} + +/* Mark the vnode inactive */ +int +vop_stdinactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + VOP_UNLOCK(ap->a_vp, 0, ap->a_td); + return (0); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + /* + * Return true for read/write. If the user asked for something + * special, return POLLNVAL, so that clients have a way of + * determining reliably whether or not the extended + * functionality is present without hard-coding knowledge + * of specific filesystem implementations. + */ + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + if (ap->a_events & ~POLLSTANDARD) + return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_sharedlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: +#ifdef DEBUG_VFS_LOCKS + /* + * Normally, we use shared locks here, but that confuses + * the locking assertions. + */ + vnflags = LK_EXCLUSIVE; + break; +#endif + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; +#ifndef DEBUG_LOCKS + return (lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td)); +#else + return (debuglockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td, + "vop_sharedlock", vp->filename, vp->line)); +#endif +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. 
+ * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_nolock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ +#ifdef notyet + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; + return(lockmgr(&vp->v_lock, vnflags, &vp->v_interlock, ap->a_td)); +#else /* for now */ + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + mtx_unlock(&ap->a_vp->v_interlock); + return (0); +#endif +} + +/* + * Do the inverse of vop_nolock, handling the interlock in a compatible way. + */ +int +vop_nounlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + } */ *ap; +{ + + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + mtx_unlock(&ap->a_vp->v_interlock); + return (0); +} + +/* + * Return whether or not the node is in use. + */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + return (0); +} + +/* + * Return our mount point, as we will take charge of the writes. + */ +int +vop_stdgetwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + + *(ap->a_mpp) = ap->a_vp->v_mount; + return (0); +} + +/* Create the VM system backing object for this vnode */ +int +vop_stdcreatevobject(ap) + struct vop_createvobject_args /* { + struct vnode *vp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct ucred *cred = ap->a_cred; + struct thread *td = ap->a_td; + struct vattr vat; + vm_object_t object; + int error = 0; + + GIANT_REQUIRED; + + if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) + return (0); + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG || vp->v_type == VDIR) { + if ((error = VOP_GETATTR(vp, &vat, cred, td)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (devsw(vp->v_rdev) != NULL) { + /* + * This simply allocates the biggest object possible + * for a disk vnode. 
This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } else { + goto retn; + } + /* + * Dereference the reference we just created. This assumes + * that the object is associated with the vp. + */ + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, td); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + goto retry; + } + } + + KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object")); + vp->v_flag |= VOBJBUF; + +retn: + return (error); +} + +/* Destroy the VM system object associated with this vnode */ +int +vop_stddestroyvobject(ap) + struct vop_destroyvobject_args /* { + struct vnode *vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vm_object_t obj = vp->v_object; + + GIANT_REQUIRED; + + if (vp->v_object == NULL) + return (0); + + if (obj->ref_count == 0) { + /* + * vclean() may be called twice. The first time + * removes the primary reference to the object, + * the second time goes one further and is a + * special-case to terminate the object. + * + * don't double-terminate the object + */ + if ((obj->flags & OBJ_DEAD) == 0) + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + return (0); +} + +/* + * Return the underlying VM object. This routine may be called with or + * without the vnode interlock held. If called without, the returned + * object is not guarenteed to be valid. The syncer typically gets the + * object without holding the interlock in order to quickly test whether + * it might be dirty before going heavy-weight. vm_object's use zalloc + * and thus stable-storage, so this is safe. + */ +int +vop_stdgetvobject(ap) + struct vop_getvobject_args /* { + struct vnode *vp; + struct vm_object **objpp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vm_object **objpp = ap->a_objpp; + + if (objpp) + *objpp = vp->v_object; + return (vp->v_object ? 0 : EINVAL); +} + +/* XXX Needs good comment and VOP_BMAP(9) manpage */ +int +vop_stdbmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + return (0); +} + +/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ +int +vop_stdgetpages(ap) + struct vop_getpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_reqpage; + vm_ooffset_t a_offset; + } */ *ap; +{ + + return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, + ap->a_count, ap->a_reqpage); +} + +/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ +int +vop_stdputpages(ap) + struct vop_putpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; + vm_ooffset_t a_offset; + } */ *ap; +{ + + return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_sync, ap->a_rtvals); +} + + + +/* + * vfs default ops + * used to fill the vfs function table to get reasonable default return values. 
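/*
 * Editor's sketch (hypothetical, member names assumed rather than taken
 * from this file): a filesystem plugs the vfs_std* routines defined
 * below into any struct vfsops slot it does not care about, so callers
 * get a harmless 0 or EOPNOTSUPP instead of an unimplemented operation.
 */
static struct vfsops myfs_vfsops = {
	.vfs_mount =	vfs_stdmount,
	.vfs_unmount =	vfs_stdunmount,
	.vfs_root =	vfs_stdroot,
	.vfs_statfs =	vfs_stdstatfs,
	.vfs_sync =	vfs_stdsync,
	.vfs_vget =	vfs_stdvget,
	.vfs_init =	vfs_stdinit,
};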
+ */ +int +vfs_stdmount (mp, path, data, ndp, td) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct thread *td; +{ + return (0); +} + +int +vfs_stdunmount (mp, mntflags, td) + struct mount *mp; + int mntflags; + struct thread *td; +{ + return (0); +} + +int +vfs_stdroot (mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdstatfs (mp, sbp, td) + struct mount *mp; + struct statfs *sbp; + struct thread *td; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdvptofh (vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdstart (mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + return (0); +} + +int +vfs_stdquotactl (mp, cmds, uid, arg, td) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct thread *td; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdsync (mp, waitfor, cred, td) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct thread *td; +{ + return (0); +} + +int +vfs_stdvget (mp, ino, flags, vpp) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdfhtovp (mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +vfs_stdinit (vfsp) + struct vfsconf *vfsp; +{ + return (0); +} + +int +vfs_stduninit (vfsp) + struct vfsconf *vfsp; +{ + return(0); +} + +int +vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td) + struct mount *mp; + int cmd; + struct vnode *filename_vp; + int attrnamespace; + const char *attrname; + struct thread *td; +{ + return(EOPNOTSUPP); +} + +/* end of vfs default ops */ diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new file mode 100644 index 0000000..ec135bd --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,400 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mount.h> +#include <net/radix.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vnode.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void vfs_free_addrlist(struct netexport *nep); +static int vfs_free_netcred(struct radix_node *rn, void *w); +static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp); + +/* + * Network address lookup element + */ +struct netcred { + struct radix_node netc_rnodes[2]; + int netc_exflags; + struct ucred netc_anon; +}; + +/* + * Network export information + */ +struct netexport { + struct netcred ne_defexported; /* Default export */ + struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */ +}; + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + /* + * XXX: This routine converts from a `struct xucred' + * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This + * operation is questionable; for example, what should be done + * with fields like cr_uidinfo and cr_prison? Currently, this + * routine does not touch them (leaves them as NULL). 
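/*
 * Editor's sketch (illustrative, not from the original source): the
 * per-mount export list built below keys each struct netcred on a
 * network address plus optional mask held in a radix tree, so matching a
 * client boils down to "client & mask == net & mask".  The standalone
 * helper shows that comparison for a plain IPv4 address; the kernel
 * delegates the real lookup to the routing radix code (rnh_addaddr and
 * rnh_matchaddr).
 */
#include <stdbool.h>
#include <stdint.h>

static bool
export_matches(uint32_t client, uint32_t net, uint32_t mask)
{
	if (mask == 0)
		return (client == net);		/* host entry: exact match */
	return ((client & mask) == (net & mask));
}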
+ */ + if (argp->ex_anon.cr_version != XUCRED_VERSION) + return (EINVAL); + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + bzero(&np->netc_anon, sizeof(np->netc_anon)); + np->netc_anon.cr_uid = argp->ex_anon.cr_uid; + np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; + bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, + sizeof(np->netc_anon.cr_groups)); + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + + if (argp->ex_addrlen > MLEN) + return (EINVAL); + + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) (saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) (saddr, smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + bzero(&np->netc_anon, sizeof(np->netc_anon)); + np->netc_anon.cr_uid = argp->ex_anon.cr_uid; + np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; + bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, + sizeof(np->netc_anon.cr_groups)); + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* Helper for vfs_free_addrlist. */ +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free(rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh); + free(rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +/* + * High level function to manipulate export options on a mount point + * and the passed in netexport. 
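/*
 * Editor's sketch (hypothetical caller, names assumed): filesystems do
 * not build export lists themselves; their mount-update path simply
 * forwards the export_args embedded in their mount arguments to
 * vfs_export(), defined below.  ex_flags carries MNT_EXPORTED or
 * MNT_DELEXPORT to publish or withdraw the export, and ex_addr/ex_mask
 * are still user pointers at this point -- vfs_hang_addrlist() copyin()s
 * them.
 */
static int
myfs_update_export(struct mount *mp, struct export_args *uea)
{
	/* uea points at the export_args copied in with the fs mount args */
	return (vfs_export(mp, uea));
}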
+ * Struct export_args *argp is the variable used to twiddle options, + * the structure is described in sys/mount.h + */ +int +vfs_export(mp, argp) + struct mount *mp; + struct export_args *argp; +{ + struct netexport *nep; + int error; + + nep = mp->mnt_export; + if (argp->ex_flags & MNT_DELEXPORT) { + if (nep == NULL) + return (ENOENT); + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_export = NULL; + free(nep, M_MOUNT); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (nep == NULL) { + nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO); + mp->mnt_export = nep; + } + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +/* + * Used by the filesystems to determine if a given network address + * (passed in 'nam') is present in thier exports list, returns a pointer + * to struct netcred so that the filesystem can examine it for + * access rights (read/write/etc). + */ +struct netcred * +vfs_export_lookup(mp, nam) + register struct mount *mp; + struct sockaddr *nam; +{ + struct netexport *nep; + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + nep = mp->mnt_export; + if (nep == NULL) + return (NULL); + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. 
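/*
 * Editor's sketch (hypothetical consumer): a file-handle server asks the
 * filesystem whether a client address is covered by an export via
 * VFS_CHECKEXP(), which vfs_stdcheckexp() below implements on top of
 * vfs_export_lookup().  MNT_EXRDONLY is used only as an example flag;
 * the surrounding NFS plumbing is elided.
 */
static int
client_may_write(struct mount *mp, struct sockaddr *client)
{
	struct ucred *anon;
	int exflags, error;

	error = VFS_CHECKEXP(mp, client, &exflags, &anon);
	if (error != 0)
		return (0);		/* not exported to this client */
	return ((exflags & MNT_EXRDONLY) == 0);
}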
+ */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)(saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * XXX: This comment comes from the deprecated ufs_check_export() + * XXX: and may not entirely apply, but lacking something better: + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Verify that a host should have access to a filesystem. + */ + +int +vfs_stdcheckexp(mp, nam, extflagsp, credanonp) + struct mount *mp; + struct sockaddr *nam; + int *extflagsp; + struct ucred **credanonp; +{ + struct netcred *np; + + np = vfs_export_lookup(mp, nam); + if (np == NULL) + return (EACCES); + *extflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + return (0); +} + diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c new file mode 100644 index 0000000..1244e54 --- /dev/null +++ b/sys/kern/vfs_extattr.c @@ -0,0 +1,4862 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysent.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/dirent.h> +#include <sys/extattr.h> +#include <sys/jail.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static int change_dir(struct nameidata *ndp, struct thread *td); +static void checkdirs(struct vnode *olddp, struct vnode *newdp); +static int chroot_refuse_vdir_fds(struct filedesc *fdp); +static int getutimes(const struct timeval *, struct timespec *); +static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); +static int setfmode(struct thread *td, struct vnode *, int); +static int setfflags(struct thread *td, struct vnode *, int); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int vfs_nmount(struct thread *td, int, struct uio *); + +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +/* ARGSUSED */ +int +nmount(td, uap) + struct thread *td; + struct nmount_args /* { + syscallarg(struct iovec *) iovp; + syscallarg(unsigned int) iovcnt; + syscallarg(int) flags; + } */ *uap; +{ + struct uio auio; + struct iovec *iov, *needfree; + struct iovec aiov[UIO_SMALLIOV]; + unsigned int i; + int error; + u_int iovlen, iovcnt; + + iovcnt = SCARG(uap, iovcnt); + iovlen = iovcnt * sizeof (struct iovec); + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV)) + return (EINVAL); + + if (iovcnt > UIO_SMALLIOV) { + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_USERSPACE; + if ((error = copyin(uap->iovp, iov, iovlen))) + goto finish; + + for (i = 0; i < iovcnt; i++) { + if (iov->iov_len > MMAXOPTIONLEN) { + error = EINVAL; + goto finish; + } + iov++; + } + error = vfs_nmount(td, SCARG(uap, flags), &auio); +finish: + if (needfree != NULL) + free(needfree, M_TEMP); + return (error); +} + +/* + * Release all resources related to the + * mount options. 
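/*
 * Editor's sketch (userland caller, illustrative): nmount() takes its
 * options as name/value iovec pairs, which is why the code above insists
 * on an even iovcnt of at least four -- the "fstype" and "fspath"
 * options are mandatory (see vfs_nmount() below).  The "from" option and
 * the "ufs" type are placeholders, not something this file defines.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <stdint.h>
#include <string.h>

static void
set_pair(struct iovec *iov, int i, const char *s)
{
	iov[i].iov_base = (void *)(uintptr_t)s;
	iov[i].iov_len = strlen(s) + 1;		/* length includes the NUL */
}

int
mount_readonly(const char *dev, const char *dir)
{
	struct iovec iov[6];

	set_pair(iov, 0, "fstype");	set_pair(iov, 1, "ufs");
	set_pair(iov, 2, "fspath");	set_pair(iov, 3, dir);
	set_pair(iov, 4, "from");	set_pair(iov, 5, dev);
	return (nmount(iov, 6, MNT_RDONLY));
}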
+ */ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); + } + free(opts, M_MOUNT); +} + +int +kernel_mount(iovp, iovcnt, flags) + struct iovec *iovp; + unsigned int iovcnt; + int flags; +{ + struct uio auio; + int error; + + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) + return (EINVAL); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + return (error); +} + +int +kernel_vmount(int flags, ...) +{ + struct iovec *iovp; + struct uio auio; + va_list ap; + unsigned int iovcnt, iovlen, len; + const char *cp; + char *buf, *pos; + size_t n; + int error, i; + + len = 0; + va_start(ap, flags); + for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++) + len += strlen(cp) + 1; + va_end(ap); + + if (iovcnt < 4 || iovcnt & 1) + return (EINVAL); + + iovlen = iovcnt * sizeof (struct iovec); + MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK); + MALLOC(buf, char *, len, M_MOUNT, M_WAITOK); + pos = buf; + va_start(ap, flags); + for (i = 0; i < iovcnt; i++) { + cp = va_arg(ap, const char *); + copystr(cp, pos, len - (pos - buf), &n); + iovp[i].iov_base = pos; + iovp[i].iov_len = n; + pos += n; + } + va_end(ap); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + FREE(iovp, M_MOUNT); + FREE(buf, M_MOUNT); + return (error); +} + +/* + * vfs_nmount(): actually attempt a filesystem mount. + */ +static int +vfs_nmount(td, fsflags, fsoptions) + struct thread *td; + int fsflags; /* Flags common to all filesystems. */ + struct uio *fsoptions; /* Options local to the filesystem. */ +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + struct vfsoptlist *optlist; + char *fstype, *fspath; + int error, flag = 0, kern_flag = 0; + int fstypelen, fspathlen; + struct vattr va; + struct nameidata nd; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + goto bad; + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { + error = ENAMETOOLONG; + goto bad; + } + + if (usermount == 0) { + error = suser(td); + if (error) + goto bad; + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + goto bad; + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. 
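/*
 * Editor's sketch (standalone restatement, not taken verbatim from the
 * source): the checks around this point form a small policy ladder --
 * non-root mounts are allowed only when the vfs.usermount sysctl is set,
 * exporting always requires root, and any non-root mount is silently
 * degraded with MNT_NOSUID | MNT_NODEV.  The flag macros are assumed to
 * come from <sys/mount.h>.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <errno.h>

static int
mount_policy(int is_root, int usermount_on, int fsflags, int *resultflags)
{
	if (!is_root && !usermount_on)
		return (EPERM);		/* vfs.usermount is off */
	if (!is_root && (fsflags & MNT_EXPORTED))
		return (EPERM);		/* only root may export */
	*resultflags = fsflags;
	if (!is_root)
		*resultflags |= MNT_NOSUID | MNT_NODEV;
	return (0);
}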
+ */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + error = EINVAL; + goto bad; + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + error = EOPNOTSUPP; /* Needs translation */ + goto bad; + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + error = EBUSY; + goto bad; + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + goto bad; + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + goto bad; + } + if (vp->v_type != VDIR) { + vput(vp); + error = ENOTDIR; + goto bad; + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + goto bad; + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + goto bad; + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + error = ENODEV; + goto bad; + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. 
+ */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); + +update: + mp->mnt_optnew = optlist; + /* + * Check if the fs implements the new VFS_NMOUNT() + * function, since the new system call was used. + */ + if (mp->mnt_op->vfs_mount != NULL) { + printf("%s doesn't support the new mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + error = EOPNOTSUPP; + goto bad; + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_NMOUNT(mp, &nd, td); + if (!error) { + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + } + /* + * Prevent external consumers of mount + * options to read mnt_optnew. + */ + mp->mnt_optnew = NULL; + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) { + vrele(vp); + goto bad; + } + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + goto bad; + } + return (0); +bad: + vfs_freeopts(optlist); + return (error); +} + +/* + * Old Mount API. 
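+ * The traditional mount(2) interface passes filesystem-specific
+ * arguments through a single userland `data' pointer interpreted by
+ * VFS_MOUNT(), instead of the name/value option list used above.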
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(td, uap) + struct thread *td; + struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + char *fstype; + char *fspath; + int error; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK); + + /* + * vfs_mount() actually takes a kernel string for `type' and + * `path' now, so extract them. + */ + error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); + if (error) + goto finish; + error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); + if (error) + goto finish; + error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), + SCARG(uap, data)); +finish: + free(fstype, M_TEMP); + free(fspath, M_TEMP); + return (error); +} + +/* + * vfs_mount(): actually attempt a filesystem mount. + * + * This routine is designed to be a "generic" entry point for routines + * that wish to mount a filesystem. All parameters except `fsdata' are + * pointers into kernel space. `fsdata' is currently still a pointer + * into userspace. + */ +int +vfs_mount(td, fstype, fspath, fsflags, fsdata) + struct thread *td; + const char *fstype; + char *fspath; + int fsflags; + void *fsdata; +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, kern_flag = 0; + struct vattr va; + struct nameidata nd; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (usermount == 0) { + error = suser(td); + if (error) + return (error); + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + return (EBUSY); + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + return (error); + } + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + return (error); + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + return (error); + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return (error); + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); +update: + /* + * Check if the fs implements the old VFS_MOUNT() + * function, since the old system call was used. + */ + if (mp->mnt_op->vfs_mount == NULL) { + printf("%s doesn't support the old mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + return (EOPNOTSUPP); + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. 
+ */ + error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) + vrele(vp); + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new + * mount point. + */ +static void +checkdirs(olddp, newdp) + struct vnode *olddp, *newdp; +{ + struct filedesc *fdp; + struct proc *p; + int nrele; + + if (olddp->v_usecount == 1) + return; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + fdp = p->p_fd; + if (fdp == NULL) { + PROC_UNLOCK(p); + continue; + } + nrele = 0; + FILEDESC_LOCK(fdp); + if (fdp->fd_cdir == olddp) { + VREF(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + VREF(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + FILEDESC_UNLOCK(fdp); + PROC_UNLOCK(p); + while (nrele--) + vrele(olddp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(td, uap) + struct thread *td; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + + /* + * Don't allow unmounting the root filesystem. 
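+ * The vnode must also be the root of the mounted filesystem; both
+ * checks are made below.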
+ */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), td)); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + struct vnode *coveredvp, *fsrootvp; + int error; + int async_flag; + + mtx_lock(&mountlist_mtx); + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + mtx_unlock(&mountlist_mtx); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | + ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); + if (error) { + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + vn_start_write(NULL, &mp, V_WAIT); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + /* Move process cdir/rdir refs on fs root to underlying vnode. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(fsrootvp, mp->mnt_vnodecovered); + if (fsrootvp == rootvnode) { + vrele(rootvnode); + rootvnode = NULL; + } + vput(fsrootvp); + } + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) || + (flags & MNT_FORCE)) { + error = VFS_UNMOUNT(mp, flags, td); + } + vn_finished_write(mp); + if (error) { + /* Undo cdir/rdir and rootvnode changes made above. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(mp->mnt_vnodecovered, fsrootvp); + if (rootvnode == NULL) { + rootvnode = fsrootvp; + vref(rootvnode); + } + vput(fsrootvp); + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mtx_lock(&mountlist_mtx); + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, + &mountlist_mtx, td); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) + coveredvp->v_mountedhere = NULL; + mp->mnt_vfc->vfc_refcount--; + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); + lockdestroy(&mp->mnt_lock); + if (coveredvp != NULL) + vrele(coveredvp); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + if (mp->mnt_op->vfs_mount == NULL) + vfs_freeopts(mp->mnt_opt); + free(mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. 
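+ * Writable filesystems are flushed with MNT_NOWAIT so that a single
+ * slow or unresponsive filesystem does not hold up the others.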
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(td, uap) + struct thread *td; + struct sync_args *uap; +{ + struct mount *mp, *nmp; + int asyncflag; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((td != NULL) ? td->td_ucred : NOCRED), td); + mp->mnt_flag |= asyncflag; + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* XXX PRISON: could be per prison flag */ +static int prison_quotas; +#if 0 +SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); +#endif + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(td, uap) + struct thread *td; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + struct mount *mp; + int error; + struct nameidata nd; + + if (jailed(td->td_ucred) && !prison_quotas) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), td); + vn_finished_write(mp); + return (error); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(td, uap) + struct thread *td; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. 
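+ * This variant takes an open file descriptor instead of a path name.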
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(td, uap) + struct thread *td; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(td, uap) + struct thread *td; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout(sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
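+ * If the descriptor refers to a mount point, the loop below descends
+ * to the root of the filesystem mounted on it.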
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(td, uap) + struct thread *td; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct vnode *vp, *tdp, *vpold; + struct mount *mp; + struct file *fp; + int error; + + if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + fdrop(fp, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, td)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(td, uap) + struct thread *td; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; + fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(fdp) + struct filedesc *fdp; +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK(fdp); + for (fd = 0; fd < fdp->fd_nfiles ; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VDIR) { + FILEDESC_UNLOCK(fdp); + return (EPERM); + } + } + } + FILEDESC_UNLOCK(fdp); + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. + * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, ""); + +/* + * Change notion of root (``/'') directory. 
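+ * Whether a process holding open directory descriptors may chroot(2)
+ * is governed by the kern.chroot_allow_open_directories sysctl above.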
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(td, uap) + struct thread *td; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + FILEDESC_LOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); + error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; + fdp->fd_rdir = nd.ni_vp; + if (!fdp->fd_jdir) { + fdp->fd_jdir = nd.ni_vp; + VREF(fdp->fd_jdir); + } + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, td) + register struct nameidata *ndp; + struct thread *td; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, td); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(td, uap) + struct thread *td; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct vattr vat; + struct mount *mp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(td, &nfp, &indx); + if (error) + return (error); + fp = nfp; + FILEDESC_LOCK(fdp); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + td->td_dupfd = -indx - 1; /* XXX check for fdopen */ + /* + * Bump the ref count to prevent another process from closing + * the descriptor while we are blocked in vn_open() + */ + fhold(fp); + error = vn_open(&nd, &flags, cmode); + if (error) { + /* + * release our own reference + */ + fdrop(fp, td); + + /* + * handle special fdopen() case. bleh. dupfdopen() is + * responsible for dropping the old contents of ofiles[indx] + * if it succeeds. + */ + if ((error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { + td->td_retval[0] = indx; + return (0); + } + /* + * Clean up the descriptor, but only if another thread hadn't + * replaced or closed it. 
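+ * (The slot is cleared and our reference dropped only if ofiles[indx]
+ * still points at this file.)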
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + + if (error == ERESTART) + error = EINTR; + return (error); + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * There should be 2 references on the file, one from the descriptor + * table, and one for us. + * + * Handle the case where someone closed the file (via its file + * descriptor) while we were blocked. The end result should look + * like opening the file succeeded but it was immediately closed. + */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); + if (fp->f_count == 1) { + KASSERT(fdp->fd_ofiles[indx] != fp, + ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + vn_close(vp, flags & FMASK, fp->f_cred, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return 0; + } + + /* assert that vn_open created a backing object if one is needed */ + KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, + ("open: vmio vnode has no backing object after vn_open")); + + fp->f_data = vp; + fp->f_flag = flags & FMASK; + fp->f_ops = &vnops; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) + goto bad; + fp->f_flag |= FHASLOCK; + } + if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + VATTR_NULL(&vat); + vat.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETATTR(vp, &vat, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + if (error) + goto bad; + } + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(td, uap) + struct thread *td; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(td, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
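+ * Character and block device nodes require full superuser privilege;
+ * the remaining node types may also be created by the root of a prison.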
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(td, uap) + struct thread *td; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = suser(td); + break; + default: + error = suser_cred(td->td_ucred, PRISON_ROOT); + break; + } + if (error) + return (error); +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + vrele(vp); + error = EEXIST; + } else { + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(td, uap) + struct thread *td; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Make a hard file link. 
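+ * Hard links to directories are refused with EPERM, as POSIX requires.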
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(td, uap) + struct thread *td; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct nameidata nd; + int error; + + bwillwrite(); + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + } + vrele(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(td, uap) + struct thread *td; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) + goto out; +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == 0) + vput(nd.ni_vp); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + uma_zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. 
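+ * The name being undeleted must currently be a whiteout entry;
+ * anything else fails with EEXIST.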
+ */ +/* ARGSUSED */ +int +undelete(td, uap) + struct thread *td; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct mount *mp; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), td); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp) + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(td, uap) + struct thread *td; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. 
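+ * Only descriptors backed by a vnode can seek; anything else returns
+ * ESPIPE.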
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(td, uap) + struct thread *td; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = td->td_ucred; + struct file *fp; + struct vnode *vp; + struct vattr vattr; + off_t offset; + int error, noneg; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (ESPIPE); + } + vp = (struct vnode *)fp->f_data; + noneg = (vp->v_type != VCHR); + offset = SCARG(uap, offset); + switch (SCARG(uap, whence)) { + case L_INCR: + if (noneg && + (fp->f_offset < 0 || + (offset > 0 && fp->f_offset > OFF_MAX - offset))) + return (EOVERFLOW); + offset += fp->f_offset; + break; + case L_XTND: + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) + return (EOVERFLOW); + offset += vattr.va_size; + break; + case L_SET: + break; + default: + fdrop(fp, td); + return (EINVAL); + } + if (noneg && offset < 0) + return (EINVAL); + fp->f_offset = offset; + *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(td, uap) + struct thread *td; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(td, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions using passed credentials. + */ +static int +vn_access(vp, user_flags, cred, td) + struct vnode *vp; + int user_flags; + struct ucred *cred; + struct thread *td; +{ + int error, flags; + + /* Flags == 0 means only check for existence. */ + error = 0; + if (user_flags) { + flags = 0; + if (user_flags & R_OK) + flags |= VREAD; + if (user_flags & W_OK) + flags |= VWRITE; + if (user_flags & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, td); + } + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(td, uap) + struct thread *td; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct ucred *cred, *tmpcred; + register struct vnode *vp; + int error; + struct nameidata nd; + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared. This could also mess up socket + * buffer accounting which can run in an interrupt context. + * + * XXX - Depending on how "threads" are finally implemented, it + * may be better to explicitly pass the credential to namei() + * rather than to modify the potentially shared process structure. 
+ */ + cred = td->td_ucred; + tmpcred = crdup(cred); + tmpcred->cr_uid = cred->cr_ruid; + tmpcred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = tmpcred; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + goto out1; + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), tmpcred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out1: + td->td_ucred = cred; + crfree(tmpcred); + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int flags; +}; +#endif +int +eaccess(td, uap) + struct thread *td; + register struct eaccess_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct nameidata nd; + struct vnode *vp; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(td, uap) + struct thread *td; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(td, uap) + struct thread *td; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(td, uap) + struct thread *td; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + +#ifdef LOOKUP_SHARED + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, + UIO_USERSPACE, SCARG(uap, path), td); +#else + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); +#endif + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(td, uap) + struct thread *td; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Implementation of the NetBSD stat() function. + * XXX This should probably be collapsed with the FreeBSD version, + * as the differences are only due to vn_stat() clearing spares at + * the end of the structures. vn_stat could be split to avoid this, + * and thus collapse the following to close to zero code. + */ +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + bzero(nsb, sizeof *nsb); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_createtimespec = sb->st_createtimespec; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(td, uap) + struct thread *td; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +nlstat(td, uap) + struct thread *td; + register struct nlstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nstat nsb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(td, uap) + struct thread *td; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(td, uap) + struct thread *td; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, td->td_ucred); + } + vput(vp); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Common implementation code for chflags() and fchflags(). + */ +static int +setfflags(td, vp, flags) + struct thread *td; + struct vnode *vp; + int flags; +{ + int error; + struct mount *mp; + struct vattr vattr; + + /* + * Prevent non-root users from setting flags on devices. When + * a device is reused, users can retain ownership of the device + * if they are allowed to set flags and programs assume that + * chown can't fail when done as root. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + } + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +/* + * Change flags of a file given a path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(td, uap) + struct thread *td; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +int +lchflags(td, uap) + struct thread *td; + register struct lchflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(td, uap) + struct thread *td; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +static int +setfmode(td, vp, mode) + struct thread *td; + struct vnode *vp; + int mode; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(td, uap) + struct thread *td; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(td, uap) + struct thread *td; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(td, uap) + struct thread *td; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +static int +setfown(td, vp, uid, gid) + struct thread *td; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(td, uap) + struct thread *td; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(td, uap) + struct thread *td; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(td, uap) + struct thread *td; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). 
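+ *
+ * A NULL tptr from the caller means "stamp both the access and the
+ * modification time with the current time"; getutimes() below
+ * synthesizes that case and setutimes() tags it with VA_UTIMES_NULL so
+ * the filesystem may apply its relaxed permission check (write access
+ * rather than ownership).  A minimal sketch of the userland side (the
+ * path is only an example; error handling is reduced to err(3)):
+ *
+ *	#include <sys/time.h>
+ *	#include <stddef.h>
+ *	#include <err.h>
+ *
+ *	if (utimes("/tmp/scratch", NULL) == -1)
+ *		err(1, "utimes");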
+ */ +static int +getutimes(usrtvp, tsp) + const struct timeval *usrtvp; + struct timespec *tsp; +{ + struct timeval tv[2]; + int error; + + if (usrtvp == NULL) { + microtime(&tv[0]); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + tsp[1] = tsp[0]; + } else { + if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) + return (error); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); + } + return 0; +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +setutimes(td, vp, ts, nullflag) + struct thread *td; + struct vnode *vp; + const struct timespec *ts; + int nullflag; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(td, uap) + struct thread *td; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(td, uap) + struct thread *td; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(td, uap) + struct thread *td; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct file *fp; + struct timeval *usrtvp; + int error; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); +} + +/* + * Truncate a file given its path name. 
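+ *
+ * A negative length is rejected with EINVAL before any lookup is done.
+ * A minimal sketch of the call (the path is only an example; error
+ * handling is reduced to err(3)):
+ *
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (truncate("/tmp/scratch", (off_t)0) == -1)
+ *		err(1, "truncate");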
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(td, uap) + struct thread *td; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + } + vput(vp); + vn_finished_write(mp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(td, uap) + struct thread *td; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); + return (EINVAL); + } + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(td, uap) + struct thread *td; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(td, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(td, uap) + struct thread *td; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(td, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
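+ *
+ * Besides calling VOP_FSYNC(), this flushes any dirty pages of the
+ * vnode's VM object and, when soft updates are compiled in and active
+ * on the mount, lets softdep_fsync() flush directory dependencies.  A
+ * minimal sketch of the caller's side (the file name is only an
+ * example; error handling is reduced to err(3)):
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	int fd = open("/tmp/journal", O_WRONLY | O_APPEND);
+ *
+ *	if (fd == -1 || write(fd, "x", 1) != 1 || fsync(fd) == -1)
+ *		err(1, "fsync");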
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(td, uap) + struct thread *td; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + vm_object_t obj; + int error; + + GIANT_REQUIRED; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, 0); + } + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); +#ifdef SOFTUPDATES + if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) + error = softdep_fsync(vp); +#endif + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(td, uap) + struct thread *td; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + bwillwrite(); + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), td); + if ((error = namei(&fromnd)) != 0) + return (error); + fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
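+	 * The check below flags that case with the private error value -1,
+	 * which the out1: path converts back to a successful return, so
+	 * e.g. rename("a", "a") succeeds without touching the filesystem.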
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(td, uap) + struct thread *td; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + + return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); +} + +int +vn_mkdir(path, mode, segflg, td) + char *path; + int mode; + enum uio_seg segflg; + struct thread *td; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. 
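+ *
+ * The path must name an empty directory that is neither "." nor the
+ * root of a mounted filesystem.  A minimal sketch pairing it with the
+ * mkdir() above (the path is only an example; error handling is reduced
+ * to err(3)); as vn_mkdir() shows, the requested mode is first masked
+ * with the process umask:
+ *
+ *	#include <sys/stat.h>
+ *	#include <unistd.h>
+ *	#include <err.h>
+ *
+ *	if (mkdir("/tmp/scratchdir", 0755) == -1)
+ *		err(1, "mkdir");
+ *	if (rmdir("/tmp/scratchdir") == -1)
+ *		err(1, "rmdir");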
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(td, uap) + struct thread *td; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(td, uap) + struct thread *td; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + /* XXX arbitrary sanity limit on `count'. */ + if (SCARG(uap, count) > 64 * 1024) + return (EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. 
+ * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(td, uap) + struct thread *td; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + if (SCARG(uap, basep) != NULL) { + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + } + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(td, uap) + struct thread *td; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return 
getdirentries(td, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + * + * MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(td, uap) + struct thread *td; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + FILEDESC_LOCK(td->td_proc->p_fd); + fdp = td->td_proc->p_fd; + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(td, uap) + struct thread *td; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR) { + vput(vp); + return (EINVAL); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + goto out; + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. + */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + int error; + struct file *fp; + + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return (error); +} +/* + * Get (NFS) file handle + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +getfh(td, uap) + struct thread *td; + register struct getfh_args *uap; +{ + struct nameidata nd; + fhandle_t fh; + register struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout(&fh, uap->fhp, sizeof (fh)); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into + * an open descriptor. + * + * warning: do not remove the suser() call or this becomes one giant + * security hole. 
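+ *
+ * Both getfh() and fhopen() are therefore restricted to the super-user.
+ * A minimal sketch of the intended use, along the lines of what
+ * rpc.lockd does (the exported path is only an example; error handling
+ * is reduced to err(3); must run as root):
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/mount.h>
+ *	#include <fcntl.h>
+ *	#include <err.h>
+ *
+ *	fhandle_t fh;
+ *	int fd;
+ *
+ *	if (getfh("/export/somefile", &fh) == -1)
+ *		err(1, "getfh");
+ *	if ((fd = fhopen(&fh, O_RDWR)) == -1)
+ *		err(1, "fhopen");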
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +fhopen(td, uap) + struct thread *td; + struct fhopen_args /* { + syscallarg(const struct fhandle *) u_fhp; + syscallarg(int) flags; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct vattr vat; + struct vattr *vap = &vat; + struct flock lf; + struct file *fp; + register struct filedesc *fdp = p->p_fd; + int fmode, mode, error, type; + struct file *nfp; + int indx; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + fmode = FFLAGS(SCARG(uap, flags)); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); + if (error) + return(error); + /* find the mount point */ + mp = vfs_getvfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); + if (error) + return (error); + /* + * from now on we have to make sure not + * to forget about the vnode + * any error that causes an abort must vput(vp) + * just set error = err and 'goto bad;'. + */ + + /* + * from vn_open + */ + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, td->td_ucred, td); + if (error) + goto bad; + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, td); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, td->td_ucred, td); + vn_finished_write(mp); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, td->td_ucred, td); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { + if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) + goto bad; + } + if (fmode & FWRITE) + vp->v_writecount++; + + /* + * end of vn_open code + */ + + if ((error = falloc(td, &nfp, &indx)) != 0) { + if (fmode & FWRITE) + vp->v_writecount--; + goto bad; + } + fp = nfp; + + /* + * Hold an extra reference to avoid having fp ripped out + * from under us while we block in the lock op + */ + fhold(fp); + nfp->f_data = vp; + nfp->f_flag = fmode & FMASK; + nfp->f_ops = &vnops; + nfp->f_type = DTYPE_VNODE; + if (fmode & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (fmode & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, td); + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) { + /* + * The lock request failed. Normally close the + * descriptor but handle the case where someone might + * have dup()d or close()d it when we weren't looking. 
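+			 * Only the descriptor table's reference is dropped
+			 * if the slot still points at our file; the extra
+			 * reference taken with fhold() above is released
+			 * either way before the error is returned.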
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + /* + * release our private reference + */ + fdrop(fp, td); + return(error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) + vfs_object_create(vp, td, td->td_ucred); + + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); + +bad: + vput(vp); + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +fhstat(td, uap) + struct thread *td; + register struct fhstat_args /* { + syscallarg(struct fhandle *) u_fhp; + syscallarg(struct stat *) sb; + } */ *uap; +{ + struct stat sb; + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); + if (error) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +fhstatfs(td, uap) + struct thread *td; + struct fhstatfs_args /* { + syscallarg(struct fhandle) *u_fhp; + syscallarg(struct statfs) *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct statfs sb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Syscall to push extended attribute configuration information into the + * VFS. Accepts a path, which it converts to a mountpoint, as well as + * a command (int cmd), and attribute name and misc data. For now, the + * attribute name is left in userspace for consumption by the VFS_op. + * It will probably be changed to be copied into sysspace by the + * syscall in the future, once issues with various consumers of the + * attribute code have raised their hands. + * + * Currently this is used only by UFS Extended Attributes. + */ +int +extattrctl(td, uap) + struct thread *td; + struct extattrctl_args /* { + syscallarg(const char *) path; + syscallarg(int) cmd; + syscallarg(const char *) filename; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + /* + * uap->attrname is not always defined. 
We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + + /* + * uap->filename is not always defined. If it is, grab a vnode lock, + * which VFS_EXTATTRCTL() will later release. + */ + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + uap->filename, td); + if ((error = namei(&nd)) != 0) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, 0); + if (error) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + + if (uap->attrname != NULL) { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, attrname, td); + } else { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, NULL, td); + } + + vn_finished_write(mp_writable); + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, + * filename_vp, so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +done: + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_set_file(td, uap) + struct thread *td; + struct extattr_set_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int 
+extattr_set_fd(td, uap) + struct thread *td; + struct extattr_set_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + + error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the + * maximum read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; + +done: + VOP_UNLOCK(vp, 0, td); + return (error); +} + +int +extattr_get_file(td, uap) + struct thread *td; + struct extattr_get_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int +extattr_get_fd(td, uap) + struct thread *td; + struct extattr_get_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + + error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, 
uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, + td); + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_delete_file(td, uap) + struct thread *td; + struct extattr_delete_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); + + vrele(nd.ni_vp); + return(error); +} + +int +extattr_delete_fd(td, uap) + struct thread *td; + struct extattr_delete_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + + error = extattr_delete_vp((struct vnode *)fp->f_data, + uap->attrnamespace, attrname, td); + + fdrop(fp, td); + return (error); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 0000000..b221cd3 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * The highest defined VFS number. + */ +int maxvfsconf = VFS_GENERIC + 1; + +/* + * Single-linked list of configured VFSes. + * New entries are added/deleted by vfs_register()/vfs_unregister() + */ +struct vfsconf *vfsconf; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. + */ + +/* Table of known vnodeop vectors (list of VFS vnode vectors) */ +static const struct vnodeopv_desc **vnodeopv_descs; +static int vnodeopv_num; + +/* Table of known descs (list of vnode op handlers "vop_access_desc") */ +static struct vnodeop_desc **vfs_op_descs; +/* Reference counts for vfs_op_descs */ +static int *vfs_op_desc_refs; +/* Number of descriptions */ +static int num_op_descs; +/* Number of entries in each description */ +static int vfs_opv_numops = 64; + +/* Allow this number to be tuned at boot */ +TUNABLE_INT("vfs.opv_numops", &vfs_opv_numops); +SYSCTL_INT(_vfs, OID_AUTO, opv_numops, CTLFLAG_RD, &vfs_opv_numops, + 0, "Maximum number of operations in vop_t vector"); + +static int int_cmp(const void *a, const void *b); + +static int +int_cmp(const void *a, const void *b) +{ + return(*(const int *)a - *(const int *)b); +} + +/* + * Recalculate the operations vector/description (those parts of it that can + * be recalculated, that is.) + * Always allocate operations vector large enough to hold vfs_opv_numops + * entries. 
The vector is never freed or deallocated once it is initialized, + * so that vnodes might safely reference it through their v_op pointer without + * vector changing suddenly from under them. + */ +static void +vfs_opv_recalc(void) +{ + int i, j, k; + int *vfs_op_offsets; + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; + struct vnodeopv_entry_desc *opve_descp; + const struct vnodeopv_desc *opv; + + if (vfs_op_descs == NULL) + panic("vfs_opv_recalc called with null vfs_op_descs"); + + /* + * Allocate and initialize temporary array to store + * offsets. Sort it to put all uninitialized entries + * first and to make holes in existing offset sequence + * detectable. + */ + MALLOC(vfs_op_offsets, int *, + num_op_descs * sizeof(int), M_TEMP, M_WAITOK); + if (vfs_op_offsets == NULL) + panic("vfs_opv_recalc: no memory"); + for (i = 0; i < num_op_descs; i++) + vfs_op_offsets[i] = vfs_op_descs[i]->vdesc_offset; + qsort(vfs_op_offsets, num_op_descs, sizeof(int), int_cmp); + + /* + * Run through and make sure all known descs have an offset. + * Use vfs_op_offsets to locate holes in offset sequence and + * reuse them. + * vop_default_desc is hardwired at offset 1, and offset 0 + * is a panic sanity check. + */ + j = 1; k = 1; + for (i = 0; i < num_op_descs; i++) { + if (vfs_op_descs[i]->vdesc_offset != 0) + continue; + /* + * Look at two adjacent entries vfs_op_offsets[j - 1] and + * vfs_op_offsets[j] and see if we can fit a new offset + * number in between. If not, look at the next pair until + * hole is found or the end of the vfs_op_offsets vector is + * reached. j has been initialized to 1 above so that + * referencing (j-1)-th element is safe and the loop will + * never execute if num_op_descs is 1. For each new value s + * of i the j loop pick up from where previous iteration has + * left off. When the last hole has been consumed or if no + * hole has been found, we will start allocating new numbers + * starting from the biggest already available offset + 1. + */ + for (; j < num_op_descs; j++) { + if (vfs_op_offsets[j - 1] < k && vfs_op_offsets[j] > k) + break; + k = vfs_op_offsets[j] + 1; + } + vfs_op_descs[i]->vdesc_offset = k++; + } + FREE(vfs_op_offsets, M_TEMP); + + /* Panic if new vops will cause vector overflow */ + if (k > vfs_opv_numops) + panic("VFS: Ran out of vop_t vector entries. %d entries required, only %d available.\n", k, vfs_opv_numops); + + /* + * Allocate and fill in the vectors + */ + for (i = 0; i < vnodeopv_num; i++) { + opv = vnodeopv_descs[i]; + opv_desc_vector_p = opv->opv_desc_vector_p; + if (*opv_desc_vector_p == NULL) + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, + M_WAITOK | M_ZERO); + + /* Fill in, with slot 0 being to return EOPNOTSUPP */ + opv_desc_vector = *opv_desc_vector_p; + opv_desc_vector[0] = (vop_t *)vop_eopnotsupp; + for (j = 0; opv->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(opv->opv_desc_ops[j]); + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + + /* Replace unfilled routines with their default (slot 1). */ + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector[1] == NULL) + panic("vfs_opv_recalc: vector without a default."); + for (j = 0; j < vfs_opv_numops; j++) + if (opv_desc_vector[j] == NULL) + opv_desc_vector[j] = opv_desc_vector[1]; + } +} + +/* Add a set of vnode operations (a description) to the table above. 
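+ * A filesystem supplies one such table per operations vector; a minimal
+ * sketch of the shape consumed here (the "sample" names stand in for a
+ * real filesystem's vector pointer and handlers), normally registered
+ * through the VNODEOP_SET() macro:
+ *
+ *	static vop_t **sample_vnodeop_p;
+ *	static struct vnodeopv_entry_desc sample_vnodeop_entries[] = {
+ *		{ &vop_default_desc, (vop_t *) vop_defaultop },
+ *		{ &vop_lookup_desc,  (vop_t *) sample_lookup },
+ *		{ NULL, NULL }
+ *	};
+ *	static struct vnodeopv_desc sample_vnodeop_opv_desc =
+ *		{ &sample_vnodeop_p, sample_vnodeop_entries };
+ *	VNODEOP_SET(sample_vnodeop_opv_desc);
+ *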
*/ +void +vfs_add_vnodeops(const void *data) +{ + const struct vnodeopv_desc *opv; + const struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j; + + opv = (const struct vnodeopv_desc *)data; + MALLOC(newopv, const struct vnodeopv_desc **, + (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (vnodeopv_descs) { + bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + } + newopv[vnodeopv_num] = opv; + vnodeopv_descs = newopv; + vnodeopv_num++; + + /* See if we have turned up a new vnode op desc */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, increase reference count */ + vfs_op_desc_refs[j]++; + break; + } + } + if (j == num_op_descs) { + /* not found, new entry */ + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs + 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs + 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (vfs_op_descs) { + bcopy(vfs_op_descs, newop, + num_op_descs * sizeof(*newop)); + FREE(vfs_op_descs, M_VNODE); + } + if (vfs_op_desc_refs) { + bcopy(vfs_op_desc_refs, newref, + num_op_descs * sizeof(*newref)); + FREE(vfs_op_desc_refs, M_VNODE); + } + newop[num_op_descs] = desc; + newref[num_op_descs] = 1; + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs++; + } + } + vfs_opv_recalc(); +} + +/* Remove a vnode type from the vnode description table above. */ +void +vfs_rm_vnodeops(const void *data) +{ + const struct vnodeopv_desc *opv; + const struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j, k; + + opv = (const struct vnodeopv_desc *)data; + /* Lower ref counts on descs in the table and release if zero */ + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, decrease reference count */ + vfs_op_desc_refs[j]--; + break; + } + } + for (j = 0; j < num_op_descs; j++) { + if (vfs_op_desc_refs[j] > 0) + continue; + if (vfs_op_desc_refs[j] < 0) + panic("vfs_remove_vnodeops: negative refcnt"); + /* Entry is going away - replace it with defaultop */ + for (k = 0; k < vnodeopv_num; k++) { + opv_desc_vector = + *(vnodeopv_descs[k]->opv_desc_vector_p); + if (opv_desc_vector != NULL) + opv_desc_vector[desc->vdesc_offset] = + opv_desc_vector[1]; + } + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs - 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs - 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + for (k = j; k < (num_op_descs - 1); k++) { + vfs_op_descs[k] = vfs_op_descs[k + 1]; + vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1]; + } + bcopy(vfs_op_descs, newop, + (num_op_descs - 1) * sizeof(*newop)); + bcopy(vfs_op_desc_refs, newref, + (num_op_descs - 1) * sizeof(*newref)); + FREE(vfs_op_descs, M_VNODE); + FREE(vfs_op_desc_refs, M_VNODE); + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs--; + } + } + + for (i = 0; i < vnodeopv_num; i++) { + if (vnodeopv_descs[i] == opv) { + for (j = i; j < (vnodeopv_num - 1); j++) + vnodeopv_descs[j] = vnodeopv_descs[j + 1]; + break; + } + } + if (i == vnodeopv_num) + panic("vfs_remove_vnodeops: opv 
not found"); + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector != NULL) + FREE(opv_desc_vector, M_VNODE); + MALLOC(newopv, const struct vnodeopv_desc **, + (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + vnodeopv_descs = newopv; + vnodeopv_num--; + + vfs_opv_recalc(); +} + +/* + * Routines having to do with the management of the vnode table. + */ +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each filesystem type. + */ +/* ARGSUSED*/ +static void +vfsinit(void *dummy) +{ + + vattr_null(&va_null); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) + +/* Register a new filesystem type in the global table */ +int +vfs_register(struct vfsconf *vfc) +{ + struct sysctl_oid *oidp; + struct vfsconf *vfsp; + + vfsp = NULL; + if (vfsconf) + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) + if (strcmp(vfc->vfc_name, vfsp->vfc_name) == 0) + return EEXIST; + + vfc->vfc_typenum = maxvfsconf++; + if (vfsp) + vfsp->vfc_next = vfc; + else + vfsconf = vfc; + vfc->vfc_next = NULL; + + /* + * If this filesystem has a sysctl node under vfs + * (i.e. vfs.xxfs), then change the oid number of that node to + * match the filesystem's type number. This allows user code + * which uses the type number to read sysctl variables defined + * by the filesystem to continue working. Since the oids are + * in a sorted list, we need to make sure the order is + * preserved by re-registering the oid after modifying its + * number. + */ + SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link) + if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { + sysctl_unregister_oid(oidp); + oidp->oid_number = vfc->vfc_typenum; + sysctl_register_oid(oidp); + } + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + return 0; +} + + +/* Remove registration of a filesystem type */ +int +vfs_unregister(struct vfsconf *vfc) +{ + struct vfsconf *vfsp, *prev_vfsp; + int error, i, maxtypenum; + + i = vfc->vfc_typenum; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) + return EINVAL; + if (vfsp->vfc_refcount) + return EBUSY; + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error) + return (error); + } + if (prev_vfsp) + prev_vfsp->vfc_next = vfsp->vfc_next; + else + vfsconf = vfsp->vfc_next; + maxtypenum = VFS_GENERIC; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + return 0; +} + +/* + * Standard kernel module handling code for filesystem modules. + * Referenced from VFS_SET(). + */ +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: /* including MOD_SHUTDOWN */ + break; + } + return (error); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 0000000..8e4af42 --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,754 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/proc.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/uma.h> + +/* + * Allocation zone for namei + */ +uma_zone_t namei_zone; + +static void +nameiinit(void *dummy __unused) +{ + namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. 
+ * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(ndp) + register struct nameidata *ndp; +{ + register struct filedesc *fdp; /* pointer to file descriptor state */ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct uio auio; + int error, linklen; + struct componentname *cnp = &ndp->ni_cnd; + struct thread *td = cnp->cn_thread; + struct proc *p = td->td_proc; + + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; + KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + fdp = p->p_fd; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + + if (error) { + uma_zfree(namei_zone, cnp->cn_pnbuf); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_NAMEI)) { + KASSERT(cnp->cn_thread == curthread, + ("namei not using curthread")); + ktrnamei(cnp->cn_pnbuf); + } +#endif + + /* + * Get starting point for the translation. + */ + FILEDESC_LOCK(fdp); + ndp->ni_rootdir = fdp->fd_rdir; + ndp->ni_topdir = fdp->fd_jdir; + + dp = fdp->fd_cdir; + VREF(dp); + FILEDESC_UNLOCK(fdp); + for (;;) { + /* + * Check if root directory should replace current directory. + * Done at start of translation and after symbolic link. 
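+		 * For example, translating "//etc//passwd" consumes the leading
+		 * slashes here, releases the reference on the current directory
+		 * and restarts the walk at ni_rootdir; the same reset happens
+		 * after an absolute symbolic link has been expanded into the
+		 * buffer.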
+ */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + dp = ndp->ni_rootdir; + VREF(dp); + } + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error) { + uma_zfree(namei_zone, cnp->cn_pnbuf); + return (error); + } + /* + * Check for symbolic link + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) + uma_zfree(namei_zone, cnp->cn_pnbuf); + else + cnp->cn_flags |= HASBUF; + + if (vn_canvmio(ndp->ni_vp) == TRUE && + (cnp->cn_nameiop != DELETE) && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == + LOCKLEAF)) + vfs_object_create(ndp->ni_vp, td, + ndp->ni_cnd.cn_cred); + + return (0); + } + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + VOP_UNLOCK(ndp->ni_dvp, 0, td); + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } + if (ndp->ni_pathlen > 1) + cp = uma_zalloc(namei_zone, M_WAITOK); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = (struct thread *)0; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen == 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENOENT; + break; + } + if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + uma_zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + } + uma_zfree(namei_zone, cnp->cn_pnbuf); + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. 
and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(ndp) + register struct nameidata *ndp; +{ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int trailing_slash; + int error = 0; + int dpunlocked = 0; /* dp has already been unlocked */ + struct componentname *cnp = &ndp->ni_cnd; + struct thread *td = cnp->cn_thread; + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + ndp->ni_dvp = NULL; + cnp->cn_flags &= ~ISSYMLINK; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + +dirloop: + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + cnp->cn_consume = 0; + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + continue; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
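+	 * A degenerate name is what remains when the caller passed "/" by
+	 * itself (namei strips the leading slashes, leaving an empty
+	 * component), so the directory itself is the answer: dp is handed
+	 * back in ni_vp below and no VOP_LOOKUP() is issued.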
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0, td); + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + /* + * Handle "..": two special cases. + * 1. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 2. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other filesystem. + * 3. If the vnode is the top directory of + * the jail or chroot, don't let them out. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + if (dp == ndp->ni_rootdir || + dp == ndp->ni_topdir || + dp == rootvnode) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_flag & VROOT) == 0 || + (cnp->cn_flags & NOCROSSMOUNT)) + break; + if (dp->v_mount == NULL) { /* forced unmount */ + error = EBADF; + goto bad; + } + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + cnp->cn_flags &= ~PDIRUNLOCK; + ASSERT_VOP_LOCKED(dp, "lookup"); + if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_flag & VROOT) && (dp->v_mount != NULL) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + if (cnp->cn_flags & PDIRUNLOCK) + vrele(tdp); + else + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + goto unionlookup; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + return (0); + } +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + + ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } + + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted filesystem. 
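+	 * For example, with a filesystem mounted on /usr, looking up "usr"
+	 * in the root directory returns the covered vnode; the loop below
+	 * busies the mount (so it cannot be unmounted underneath us), asks
+	 * VFS_ROOT() for the root vnode of the mounted filesystem and
+	 * continues the walk there, repeating in case that root is itself
+	 * a mount point.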
+ */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0, 0, td)) + continue; + VOP_UNLOCK(dp, 0, td); + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) { + dpunlocked = 1; + goto bad2; + } + vrele(dp); + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_mount == NULL) { + /* We can't know whether the directory was mounted with + * NOSYMFOLLOW, so we can't follow safely. */ + error = EBADF; + goto bad2; + } + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + +nextname: + /* + * Not a symbolic link. If more pathname, + * continue at next component, else return. + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != ndp->ni_vp) + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) + vrele(ndp->ni_dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, td); + return (0); + +bad2: + if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT && + *ndp->ni_next == '\0') + VOP_UNLOCK(ndp->ni_dvp, 0, td); + vrele(ndp->ni_dvp); +bad: + if (dpunlocked) + vrele(dp); + else + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + struct thread *td = cnp->cn_thread; + struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); + +/* dirloop: */ + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
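+	 * (As in lookup() above, a degenerate name here means the caller is
+	 * really asking for the directory itself; dvp is handed back in
+	 * *vpp without a VOP_LOOKUP() call.)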
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0, td); + *vpp = dp; + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + + if (vn_canvmio(dp) == TRUE && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) + vfs_object_create(dp, td, cnp->cn_cred); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, td); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, td); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..20d9b90 --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,396 @@ +/*- + * Copyright (c) 1999 Michael Smith + * All rights reserved. + * Copyright (c) 1999 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Locate and mount the root filesystem. + * + * The root filesystem is detailed in the kernel environment variable + * vfs.root.mountfrom, which is expected to be in the general format + * + * <vfsname>:[<path>] + * vfsname := the name of a VFS known to the kernel and capable + * of being mounted as root + * path := disk device name or other data used by the filesystem + * to locate its physical store + * + */ + +#include "opt_rootdevname.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/reboot.h> +#include <sys/diskslice.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#include <sys/cons.h> +#include <sys/proc.h> + +#include "opt_ddb.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#include <paths.h> + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); + +#define ROOTNAME "root_device" + +/* + * The vnode of the system's root (/ in the filesystem, without chroot + * active.) + */ +struct vnode *rootvnode; + +/* + * The root specifiers we will try if RB_CDROM is specified. + */ +static char *cdrom_rootdevnames[] = { + "cd9660:cd0a", + "cd9660:acd0a", + "cd9660:wcd0a", + NULL +}; + +static int vfs_mountroot_try(char *mountfrom); +static int vfs_mountroot_ask(void); +static void gets(char *cp); + +/* legacy find-root code */ +char *rootdevnames[2] = {NULL, NULL}; +static int setrootbyname(char *name); +dev_t rootdev = NODEV; + +/* + * Find and mount the root filesystem + */ +void +vfs_mountroot(void *foo __unused) +{ + char *cp; + int i, error; + + /* + * The root filesystem information is compiled in, and we are + * booted with instructions to use it. + */ +#ifdef ROOTDEVNAME + if ((boothowto & RB_DFLTROOT) && + !vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + /* + * We are booted with instructions to prompt for the root filesystem, + * or to use the compiled-in default when it doesn't exist. + */ + if (boothowto & (RB_DFLTROOT | RB_ASKNAME)) { + if (!vfs_mountroot_ask()) + return; + } + + /* + * We've been given the generic "use CDROM as root" flag. This is + * necessary because one media may be used in many different + * devices, so we need to search for them. + */ + if (boothowto & RB_CDROM) { + for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { + if (!vfs_mountroot_try(cdrom_rootdevnames[i])) + return; + } + } + + /* + * Try to use the value read by the loader from /etc/fstab, or + * supplied via some other means. This is the preferred + * mechanism. + */ + if ((cp = getenv("vfs.root.mountfrom")) != NULL) { + error = vfs_mountroot_try(cp); + freeenv(cp); + if (!error) + return; + } + + /* + * Try values that may have been computed by the machine-dependant + * legacy code. 
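+	 * (These are only fallbacks; the preferred path above is the kernel
+	 * environment, e.g. a value such as "ufs:da0s1a" in
+	 * vfs.root.mountfrom, normally derived by the loader from the root
+	 * entry in /etc/fstab.)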
+ */ + if (!vfs_mountroot_try(rootdevnames[0])) + return; + if (!vfs_mountroot_try(rootdevnames[1])) + return; + + /* + * If we have a compiled-in default, and haven't already tried it, try + * it now. + */ +#ifdef ROOTDEVNAME + if (!(boothowto & RB_DFLTROOT)) + if (!vfs_mountroot_try(ROOTDEVNAME)) + return; +#endif + + /* + * Everything so far has failed, prompt on the console if we haven't + * already tried that. + */ + if (!(boothowto & (RB_DFLTROOT | RB_ASKNAME)) && !vfs_mountroot_ask()) + return; + panic("Root mount failed, startup aborted."); +} + +/* + * Mount (mountfrom) as the root filesystem. + */ +static int +vfs_mountroot_try(char *mountfrom) +{ + struct mount *mp; + char *vfsname, *path; + int error; + char patt[32]; + int s; + + vfsname = NULL; + path = NULL; + mp = NULL; + error = EINVAL; + + if (mountfrom == NULL) + return(error); /* don't complain */ + + s = splcam(); /* Overkill, but annoying without it */ + printf("Mounting root from %s\n", mountfrom); + splx(s); + + /* parse vfs name and path */ + vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); + path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); + vfsname[0] = path[0] = 0; + sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); + if (sscanf(mountfrom, patt, vfsname, path) < 1) + goto done; + + /* allocate a root mount */ + error = vfs_rootmountalloc(vfsname, path[0] != 0 ? path : ROOTNAME, + &mp); + if (error != 0) { + printf("Can't allocate root mount for filesystem '%s': %d\n", + vfsname, error); + goto done; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* do our best to set rootdev */ + if ((path[0] != 0) && setrootbyname(path)) + printf("setrootbyname failed\n"); + + /* If the root device is a type "memory disk", mount RW */ + if (rootdev != NODEV && devsw(rootdev) && + (devsw(rootdev)->d_flags & D_MEMDISK)) + mp->mnt_flag &= ~MNT_RDONLY; + + /* + * Set the mount path to be something useful, because the + * filesystem code isn't responsible now for initialising + * f_mntonname unless they want to override the default + * (which is `path'.) + */ + strncpy(mp->mnt_stat.f_mntonname, "/", MNAMELEN); + + error = VFS_MOUNT(mp, NULL, NULL, NULL, curthread); + +done: + if (vfsname != NULL) + free(vfsname, M_MOUNT); + if (path != NULL) + free(path, M_MOUNT); + if (error != 0) { + if (mp != NULL) { + vfs_unbusy(mp, curthread); + free(mp, M_MOUNT); + } + printf("Root mount failed: %d\n", error); + } else { + + /* register with list of mounted filesystems */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + + /* sanity check system clock against root filesystem timestamp */ + inittodr(mp->mnt_time); + vfs_unbusy(mp, curthread); + } + return(error); +} + +/* + * Spin prompting on the console for a suitable root filesystem + */ +static int +vfs_mountroot_ask(void) +{ + char name[128]; + int i; + dev_t dev; + + for(;;) { + printf("\nManual root filesystem specification:\n"); + printf(" <fstype>:<device> Mount <device> using filesystem <fstype>\n"); +#if defined(__i386__) || defined(__ia64__) + printf(" eg. ufs:da0s1a\n"); +#else + printf(" eg. ufs:da0a\n"); +#endif + printf(" ? 
List valid disk boot devices\n"); + printf(" <empty line> Abort manual input\n"); + printf("\nmountroot> "); + gets(name); + if (name[0] == 0) + return(1); + if (name[0] == '?') { + printf("Possibly valid devices for 'ufs' root:\n"); + for (i = 0; i < NUMCDEVSW; i++) { + dev = makedev(i, 0); + if (devsw(dev) != NULL) + printf(" \"%s\"", devsw(dev)->d_name); + } + printf("\n"); + continue; + } + if (!vfs_mountroot_try(name)) + return(0); + } +} + +/* + * Local helper function for vfs_mountroot_ask. + */ +static void +gets(char *cp) +{ + char *lp; + int c; + + lp = cp; + for (;;) { + printf("%c", c = cngetc() & 0177); + switch (c) { + case -1: + case '\n': + case '\r': + *lp++ = '\0'; + return; + case '\b': + case '\177': + if (lp > cp) { + printf(" \b"); + lp--; + } + continue; + case '#': + lp--; + if (lp < cp) + lp = cp; + continue; + case '@': + case 'u' & 037: + lp = cp; + printf("%c", '\n'); + continue; + default: + *lp++ = c; + } + } +} + +/* + * Convert a given name to the dev_t of the disk-like device + * it refers to. + */ +dev_t +getdiskbyname(char *name) { + char *cp; + dev_t dev; + + cp = name; + if (!bcmp(cp, "/dev/", 5)) + cp += 5; + + dev = NODEV; + EVENTHANDLER_INVOKE(dev_clone, cp, strlen(cp), &dev); + return (dev); +} + +/* + * Set rootdev to match (name), given that we expect it to + * refer to a disk-like device. + */ +static int +setrootbyname(char *name) +{ + dev_t diskdev; + + diskdev = getdiskbyname(name); + if (diskdev != NODEV) { + rootdev = diskdev; + return (0); + } + + return (1); +} + +/* Show the dev_t for a disk specified by name */ +#ifdef DDB +DB_SHOW_COMMAND(disk, db_getdiskbyname) +{ + dev_t dev; + + if (modif[0] == '\0') { + db_error("usage: show disk/devicename"); + return; + } + dev = getdiskbyname(modif); + if (dev != NODEV) + db_printf("dev_t = %p\n", dev); + else + db_printf("No disk device matched.\n"); +} +#endif diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c new file mode 100644 index 0000000..0575662 --- /dev/null +++ b/sys/kern/vfs_subr.c @@ -0,0 +1,3275 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $FreeBSD$ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/eventhandler.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void addalias(struct vnode *vp, dev_t nvp_rdev); +static void insmntque(struct vnode *vp, struct mount *mp); +static void vclean(struct vnode *vp, int flags, struct thread *td); +static void vlruvp(struct vnode *vp); + +/* + * Number of vnodes in existence. Increased whenever getnewvnode() + * allocates a new vnode, never decreased. + */ +static unsigned long numvnodes; + +SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +/* + * Conversion tables for conversion from vnode types to inode formats + * and back. + */ +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * List of vnodes that are ready for recycling. + */ +static TAILQ_HEAD(freelst, vnode) vnode_free_list; + +/* + * Minimum number of free vnodes. If there are fewer than this free vnodes, + * getnewvnode() will return a newly allocated vnode. + */ +static u_long wantfreevnodes = 25; +SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +/* Number of vnodes in the free list. */ +static u_long freevnodes; +SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +/* + * Various variables used for debugging the new implementation of + * reassignbuf(). + * XXX these are probably of (very) limited utility now. 
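+ * They are exported below under the vfs sysctl tree, so e.g.
+ * "sysctl vfs.reassignbufcalls" reads the call counter from userland.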
+ */ +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); +static int nameileafonly; +SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); + +#ifdef ENABLE_VFS_IOOPT +/* See NOTES for a description of this setting. */ +int vfs_ioopt; +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +/* List of mounted filesystems. */ +struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); + +/* For any iteration/modification of mountlist */ +struct mtx mountlist_mtx; + +/* For any iteration/modification of mnt_vnodelist */ +struct mtx mntvnode_mtx; + +/* + * Cache for the mount type id assigned to NFS. This is used for + * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. + */ +int nfs_mount_type = -1; + +/* To keep more than one thread at a time from running vfs_getnewfsid */ +static struct mtx mntid_mtx; + +/* For any iteration/modification of vnode_free_list */ +static struct mtx vnode_free_list_mtx; + +/* + * For any iteration/modification of dev->si_hlist (linked through + * v_specnext) + */ +static struct mtx spechash_mtx; + +/* Publicly exported FS */ +struct nfs_public nfs_pub; + +/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ +static uma_zone_t vnode_zone; +static uma_zone_t vnodepoll_zone; + +/* Set to 1 to print out reclaim of active vnodes */ +int prtactive; + +/* + * The workitem queue. + * + * It is useful to delay writes of file data and filesystem metadata + * for tens of seconds so that quickly created and deleted files need + * not waste disk bandwidth being created and removed. To realize this, + * we append vnodes to a "workitem" queue. When running with a soft + * updates implementation, most pending metadata dependencies should + * not wait for more than a few seconds. Thus, mounted on block devices + * are delayed only about a half the time that file data is delayed. + * Similarly, directory updates are more critical, so are only delayed + * about a third the time that file data is delayed. Thus, there are + * SYNCER_MAXDELAY queues that are processed round-robin at a rate of + * one each second (driven off the filesystem syncer process). The + * syncer_delayno variable indicates the next queue that is to be processed. 
+ * Items that need to be processed soon are placed in this queue: + * + * syncer_workitem_pending[syncer_delayno] + * + * A delay of fifteen seconds is done by placing the request fifteen + * entries later in the queue: + * + * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] + * + */ +static int syncer_delayno; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +static int syncdelay = 30; /* max time to delay syncing data */ +static int filedelay = 30; /* time to delay syncing files */ +SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); +static int dirdelay = 29; /* time to delay syncing directories */ +SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); +static int metadelay = 28; /* time to delay syncing metadata */ +SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); +static int rushjob; /* number of slots to run ASAP */ +static int stat_rush_requests; /* number of times I/O speeded up */ +SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); + +/* + * Number of vnodes we want to exist at any one time. This is mostly used + * to size hash tables in vnode-related code. It is normally not used in + * getnewvnode(), as wantfreevnodes is normally nonzero.) + * + * XXX desiredvnodes is historical cruft and should not exist. + */ +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, + &desiredvnodes, 0, "Maximum number of vnodes"); +static int minvnodes; +SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, + &minvnodes, 0, "Minimum number of vnodes"); +static int vnlru_nowhere; +SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, + "Number of times the vnlru process ran without success"); + +#ifdef DEBUG_VFS_LOCKS +/* Print lock violations */ +int vfs_badlock_print = 1; +/* Panic on violation */ +int vfs_badlock_panic = 1; +#endif + +void +v_addpollinfo(struct vnode *vp) +{ + vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK); + mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); +} + +/* + * Initialize the vnode management data structures. + */ +static void +vntblinit(void *dummy __unused) +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + minvnodes = desiredvnodes / 4; + mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF); + mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF); + mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); + mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF); + TAILQ_INIT(&vnode_free_list); + mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); + vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) + + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. 
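+ *
+ * The usual traversal of the mount list (a sketch only, mirroring the
+ * pattern vnlru_proc() uses below) is:
+ *
+ *	mtx_lock(&mountlist_mtx);
+ *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
+ *			nmp = TAILQ_NEXT(mp, mnt_list);
+ *			continue;	(the interlock is still held)
+ *		}
+ *		... work on mp; vfs_busy() dropped the interlock ...
+ *		mtx_lock(&mountlist_mtx);
+ *		nmp = TAILQ_NEXT(mp, mnt_list);
+ *		vfs_unbusy(mp, td);
+ *	}
+ *	mtx_unlock(&mountlist_mtx);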
+ */ +int +vfs_busy(mp, flags, interlkp, td) + struct mount *mp; + int flags; + struct mtx *interlkp; + struct thread *td; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + msleep(mp, interlkp, PVFS, "vfs_busy", 0); + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, td) + struct mount *mp; + struct thread *td; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct thread *td = curthread; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_iosize_max = DFLTPHYS; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. + */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot)(void); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + mtx_unlock(&mountlist_mtx); + return (mp); + } + } + mtx_unlock(&mountlist_mtx); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid. Try to make its val[0] unique, since this value + * will be used to create fake device numbers for stat(). 
Also try (but + * not so hard) make its val[0] unique mod 2^16, since some emulators only + * support 16-bit device numbers. We end up with unique val[0]'s for the + * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. + * + * Keep in mind that several mounts may be running in parallel. Starting + * the search one past where the previous search terminated is both a + * micro-optimization and a defense against returning the same fsid to + * different mounts. + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_int16_t mntid_base; + fsid_t tfsid; + int mtype; + + mtx_lock(&mntid_mtx); + mtype = mp->mnt_vfc->vfc_typenum; + tfsid.val[1] = mtype; + mtype = (mtype & 0xFF) << 24; + for (;;) { + tfsid.val[0] = makeudev(255, + mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); + mntid_base++; + if (vfs_getvfs(&tfsid) == NULL) + break; + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; + mtx_unlock(&mntid_mtx); +} + +/* + * Knob to control the precision of file timestamps: + * + * 0 = seconds only; nanoseconds zeroed. + * 1 = seconds and nanoseconds, accurate within 1/HZ. + * 2 = seconds and nanoseconds, truncated to microseconds. + * >=3 = seconds and nanoseconds, maximum precision. + */ +enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; + +static int timestamp_precision = TSP_SEC; +SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, + ×tamp_precision, 0, ""); + +/* + * Get a current timestamp. + */ +void +vfs_timestamp(tsp) + struct timespec *tsp; +{ + struct timeval tv; + + switch (timestamp_precision) { + case TSP_SEC: + tsp->tv_sec = time_second; + tsp->tv_nsec = 0; + break; + case TSP_HZ: + getnanotime(tsp); + break; + case TSP_USEC: + microtime(&tv); + TIMEVAL_TO_TIMESPEC(&tv, tsp); + break; + case TSP_NSEC: + default: + nanotime(tsp); + break; + } +} + +/* + * Build a linked list of mount options from a struct uio. + */ +int +vfs_buildopts(struct uio *auio, struct vfsoptlist **options) +{ + struct vfsoptlist *opts; + struct vfsopt *opt; + unsigned int i, iovcnt; + int error, namelen, optlen; + + iovcnt = auio->uio_iovcnt; + opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); + TAILQ_INIT(opts); + for (i = 0; i < iovcnt; i += 2) { + opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + namelen = auio->uio_iov[i].iov_len; + optlen = auio->uio_iov[i + 1].iov_len; + opt->name = malloc(namelen, M_MOUNT, M_WAITOK); + opt->value = malloc(optlen, M_MOUNT, M_WAITOK); + opt->len = optlen; + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); + bcopy(auio->uio_iov[i + 1].iov_base, opt->value, + optlen); + } else { + error = copyin(auio->uio_iov[i].iov_base, opt->name, + namelen); + if (!error) + error = copyin(auio->uio_iov[i + 1].iov_base, + opt->value, optlen); + if (error) + goto bad; + } + TAILQ_INSERT_TAIL(opts, opt, link); + } + *options = opts; + return (0); +bad: + vfs_freeopts(opts); + return (error); +} + +/* + * Get a mount option by its name. + * + * Return 0 if the option was found, ENOENT otherwise. + * If len is non-NULL it will be filled with the length + * of the option. If buf is non-NULL, it will be filled + * with the address of the option. 
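+ *
+ * A sketch of a consumer (the option names "from" and "ro" are only
+ * examples) working on a list built by vfs_buildopts():
+ *
+ *	void *value;
+ *	int len, ro;
+ *
+ *	if (vfs_getopt(opts, "from", &value, &len) == 0)
+ *		... value points at the option data, len is its size ...
+ *	if (vfs_copyopt(opts, "ro", &ro, sizeof(ro)) == 0)
+ *		... the option data was copied into ro (vfs_copyopt() below) ...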
+ */ +int +vfs_getopt(opts, name, buf, len) + struct vfsoptlist *opts; + const char *name; + void **buf; + int *len; +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + if (len != NULL) + *len = opt->len; + if (buf != NULL) + *buf = opt->value; + return (0); + } + } + return (ENOENT); +} + +/* + * Find and copy a mount option. + * + * The size of the buffer has to be specified + * in len, if it is not the same length as the + * mount option, EINVAL is returned. + * Returns ENOENT if the option is not found. + */ +int +vfs_copyopt(opts, name, dest, len) + struct vfsoptlist *opts; + const char *name; + void *dest; + int len; +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + if (len != opt->len) + return (EINVAL); + bcopy(opt->value, dest, opt->len); + return (0); + } + } + return (ENOENT); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * This routine is called when we have too many vnodes. It attempts + * to free <count> vnodes and will potentially free vnodes that still + * have VM backing store (VM backing store is typically the cause + * of a vnode blowout so we want to do this). Therefore, this operation + * is not considered cheap. + * + * A number of conditions may prevent a vnode from being reclaimed. + * the buffer cache may have references on the vnode, a directory + * vnode may still have references due to the namei cache representing + * underlying files, or the vnode may be in active use. It is not + * desireable to reuse such vnodes. These conditions may cause the + * number of vnodes to reach some minimum value regardless of what + * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. + */ +static int +vlrureclaim(struct mount *mp, int count) +{ + struct vnode *vp; + int done; + int trigger; + int usevnodes; + + /* + * Calculate the trigger point, don't allow user + * screwups to blow us up. This prevents us from + * recycling vnodes with lots of resident pages. We + * aren't trying to free memory, we are trying to + * free vnodes. 
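+	 * With illustrative numbers: 262144 physical pages and
+	 * desiredvnodes == 32768 give a trigger of 16, so this pass only
+	 * recycles vnodes caching fewer than 16 resident pages.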
+ */ + usevnodes = desiredvnodes; + if (usevnodes <= 0) + usevnodes = 1; + trigger = cnt.v_page_count * 2 / usevnodes; + + done = 0; + mtx_lock(&mntvnode_mtx); + while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + + if (vp->v_type != VNON && + vp->v_type != VBAD && + VMIGHTFREE(vp) && /* critical path opt */ + (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) && + mtx_trylock(&vp->v_interlock) + ) { + mtx_unlock(&mntvnode_mtx); + if (VMIGHTFREE(vp)) { + vgonel(vp, curthread); + done++; + } else { + mtx_unlock(&vp->v_interlock); + } + mtx_lock(&mntvnode_mtx); + } + --count; + } + mtx_unlock(&mntvnode_mtx); + return done; +} + +/* + * Attempt to recycle vnodes in a context that is always safe to block. + * Calling vlrurecycle() from the bowels of filesystem code has some + * interesting deadlock problems. + */ +static struct proc *vnlruproc; +static int vnlruproc_sig; + +static void +vnlru_proc(void) +{ + struct mount *mp, *nmp; + int s; + int done; + struct proc *p = vnlruproc; + struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ + + mtx_lock(&Giant); + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, + SHUTDOWN_PRI_FIRST); + + s = splbio(); + for (;;) { + kthread_suspend_check(p); + if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { + vnlruproc_sig = 0; + tsleep(vnlruproc, PVFS, "vlruwt", 0); + continue; + } + done = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + done += vlrureclaim(mp, 10); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (done == 0) { +#if 0 + /* These messages are temporary debugging aids */ + if (vnlru_nowhere < 5) + printf("vnlru process getting nowhere..\n"); + else if (vnlru_nowhere == 5) + printf("vnlru process messages stopped.\n"); +#endif + vnlru_nowhere++; + tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); + } + } + splx(s); +} + +static struct kproc_desc vnlru_kp = { + "vnlru", + vnlru_proc, + &vnlruproc +}; +SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) + + +/* + * Routines having to do with the management of the vnode table. + */ + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct thread *td = curthread; /* XXX */ + struct vnode *vp = NULL; + struct mount *vnmp; + vm_object_t object; + + s = splbio(); + /* + * Try to reuse vnodes if we hit the max. This situation only + * occurs in certain large-memory (2G+) situations. We cannot + * attempt to directly reclaim vnodes due to nasty recursion + * problems. + */ + if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) { + vnlruproc_sig = 1; /* avoid unnecessary wakeups */ + wakeup(vnlruproc); + } + + /* + * Attempt to reuse a vnode already on the free list, allocating + * a new vnode if we can't find one or if we have not reached a + * good minimum for good LRU performance. 
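+	 * The free list is only consulted once freevnodes has reached
+	 * wantfreevnodes and numvnodes has reached minvnodes; until then a
+	 * fresh vnode is simply allocated from vnode_zone below.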
+ */ + + mtx_lock(&vnode_free_list_mtx); + + if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { + int count; + + for (count = 0; count < freevnodes; count++) { + vp = TAILQ_FIRST(&vnode_free_list); + if (vp == NULL || vp->v_usecount) + panic("getnewvnode: free vnode isn't"); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + + /* Don't recycle if we can't get the interlock */ + if (!mtx_trylock(&vp->v_interlock)) { + vp = NULL; + continue; + } + + /* We should be able to immediately acquire this */ + if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) + continue; + /* + * Don't recycle if we still have cached pages. + */ + if (VOP_GETVOBJECT(vp, &object) == 0 && + (object->resident_page_count || + object->ref_count)) { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_freelist); + VOP_UNLOCK(vp, 0, td); + vp = NULL; + continue; + } + if (LIST_FIRST(&vp->v_cache_src)) { + /* + * note: nameileafonly sysctl is temporary, + * for debugging only, and will eventually be + * removed. + */ + if (nameileafonly > 0) { + /* + * Do not reuse namei-cached directory + * vnodes that have cached + * subdirectories. + */ + if (cache_leaf_test(vp) < 0) { + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + continue; + } + } else if (nameileafonly < 0 || + vmiodirenable == 0) { + /* + * Do not reuse namei-cached directory + * vnodes if nameileafonly is -1 or + * if VMIO backing for directories is + * turned off (otherwise we reuse them + * too quickly). + */ + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + continue; + } + } + /* + * Skip over it if its filesystem is being suspended. + */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) + break; + VOP_UNLOCK(vp, 0, td); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + vp = NULL; + } + } + if (vp) { + vp->v_flag |= VDOOMED; + vp->v_flag &= ~VFREE; + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + cache_purge(vp); + if (vp->v_type != VBAD) { + VOP_UNLOCK(vp, 0, td); + vgone(vp); + } else { + VOP_UNLOCK(vp, 0, td); + } + vn_finished_write(vnmp); + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + if (vp->v_writecount != 0) + panic("Non-zero write count"); + } +#endif + if (vp->v_pollinfo) { + mtx_destroy(&vp->v_pollinfo->vpi_lock); + uma_zfree(vnodepoll_zone, vp->v_pollinfo); + } + vp->v_pollinfo = NULL; + vp->v_flag = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + } else { + mtx_unlock(&vnode_free_list_mtx); + vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); + bzero((char *) vp, sizeof *vp); + mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE); + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + + splx(s); + +#if 0 + vnodeallocs++; + if (vnodeallocs % vnoderecycleperiod == 0 && + freevnodes < vnoderecycleminfreevn && + vnoderecyclemintotalvn < numvnodes) { + /* Recycle vnodes. */ + cache_purgeleafdirs(vnoderecyclenumber); + } +#endif + + return (0); +} + +/* + * Move a vnode from one mount queue to another. 
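+ * Passing a NULL mp simply removes the vnode from its old mount point's
+ * list (if any) without placing it on a new one.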
+ */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + mtx_lock(&mntvnode_mtx); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. + */ + if ((vp->v_mount = mp) == NULL) { + mtx_unlock(&mntvnode_mtx); + return; + } + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup(&vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, td, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct thread *td; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + GIANT_REQUIRED; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep(&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + error = BUF_TIMELOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL, + "vinvalbuf", slpflag, slptimeo); + if (error == ENOLCK) + break; + splx(s); + return (error); + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while BUF_WRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + BUF_UNLOCK(bp); + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= B_ASYNC; + BUF_WRITE(bp); + } + } else { + bremfree(bp); + (void) BUF_WRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + /* + * Wait for I/O to complete. XXX needs cleaning up. The vnode can + * have write I/O in-progress but if there is a VM object then the + * VM object can also have read-I/O in-progress. + */ + do { + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + if (VOP_GETVOBJECT(vp, &object) == 0) { + while (object->paging_in_progress) + vm_object_pip_sleep(object, "vnvlbx"); + } + } while (vp->v_numoutput > 0); + + splx(s); + + /* + * Destroy the copy in the VM cache, too. 
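+	 * The pages are removed with the vnode interlock held; when V_SAVE
+	 * was requested only clean pages should be discarded, as dirty
+	 * pages were flushed by the VOP_FSYNC() above.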
+ */ + mtx_lock(&vp->v_interlock); + if (VOP_GETVOBJECT(vp, &object) == 0) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + mtx_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. + */ +int +vtruncbuf(vp, cred, td, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct thread *td; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && + (((nbp->b_xflags & BX_VNCLEAN) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && + (((nbp->b_xflags & BX_VNDIRTY) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { + BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); + goto restart; + } else { + bremfree(bp); + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + BUF_WRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + bp->b_dev = vn_todev(vp); + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= BX_VNCLEAN; + bp->b_xflags &= ~BX_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. 
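+	 * If this was the vnode's last dirty buffer, the vnode is also
+	 * taken off the syncer worklist, and the hold reference acquired
+	 * in bgetvp() is released with vdrop().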
+ */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { + if (bp->b_xflags & BX_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * Add an item to the syncer work queue. + */ +static void +vn_syncer_add_to_worklist(struct vnode *vp, int delay) +{ + int s, slot; + + s = splbio(); + + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + } + + if (delay > syncer_maxdelay - 2) + delay = syncer_maxdelay - 2; + slot = (syncer_delayno + delay) & syncer_mask; + + LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); + vp->v_flag |= VONWORKLST; + splx(s); +} + +struct proc *updateproc; +static void sched_sync(void); +static struct kproc_desc up_kp = { + "syncer", + sched_sync, + &updateproc +}; +SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) + +/* + * System filesystem synchronizer daemon. + */ +void +sched_sync(void) +{ + struct synclist *slp; + struct vnode *vp; + struct mount *mp; + long starttime; + int s; + struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */ + + mtx_lock(&Giant); + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc, + SHUTDOWN_PRI_LAST); + + for (;;) { + kthread_suspend_check(td->td_proc); + + starttime = time_second; + + /* + * Push files whose dirty time has expired. Be careful + * of interrupt race on slp queue. + */ + s = splbio(); + slp = &syncer_workitem_pending[syncer_delayno]; + syncer_delayno += 1; + if (syncer_delayno == syncer_maxdelay) + syncer_delayno = 0; + splx(s); + + while ((vp = LIST_FIRST(slp)) != NULL) { + if (VOP_ISLOCKED(vp, NULL) == 0 && + vn_start_write(vp, &mp, V_NOWAIT) == 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + } + s = splbio(); + if (LIST_FIRST(slp) == vp) { + /* + * Note: v_tag VT_VFS vps can remain on the + * worklist too with no dirty blocks, but + * since sync_fsync() moves it to a different + * slot we are safe. + */ + if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && + !vn_isdisk(vp, NULL)) + panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); + /* + * Put us back on the worklist. The worklist + * routine will remove us from our current + * position and then add us back in at a later + * position. + */ + vn_syncer_add_to_worklist(vp, syncdelay); + } + splx(s); + } + + /* + * Do soft update processing. + */ +#ifdef SOFTUPDATES + softdep_process_worklist(NULL); +#endif + + /* + * The variable rushjob allows the kernel to speed up the + * processing of the filesystem syncer process. A rushjob + * value of N tells the filesystem syncer to process the next + * N seconds worth of work on its queue ASAP. Currently rushjob + * is used by the soft update code to speed up the filesystem + * syncer process when the incore state is getting so far + * ahead of the disk that the kernel memory pool is being + * threatened with exhaustion. + */ + if (rushjob > 0) { + rushjob -= 1; + continue; + } + /* + * If it has taken us less than a second to process the + * current work, then wait. Otherwise start right over + * again. 
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Request the syncer daemon to speed up its work. + * We never push it to speed up more than half of its + * normal turn time, otherwise it could take over the cpu. + * XXXKSE only one update? + */ +int +speedup_syncer() +{ + + mtx_lock_spin(&sched_lock); + if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */ + setrunnable(FIRST_THREAD_IN_PROC(updateproc)); + mtx_unlock_spin(&sched_lock); + if (rushjob < syncdelay / 2) { + rushjob += 1; + stat_rush_requests += 1; + return (1); + } + return(0); +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + bp->b_dev = vn_todev(vp); +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + + /* XXX REMOVE ME */ + if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + ++reassignbufcalls; + + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { + if (bp->b_xflags & BX_VNDIRTY) + listheadp = &bp->b_vp->v_dirtyblkhd; + else + listheadp = &bp->b_vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); + if (bp->b_vp != newvp) { + vdrop(bp->b_vp); + bp->b_vp = NULL; /* for clarification */ + } + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
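+	 * The dirty list is kept roughly sorted by logical block number,
+	 * with meta-data buffers (negative b_lblkno) gathered at the tail;
+	 * reassignbufmethod selects between the cheap sequential-insert
+	 * heuristic and a full scan of the queue.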
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = dirdelay; + break; + case VCHR: + if (newvp->v_rdev->si_mountpoint != NULL) { + delay = metadelay; + break; + } + /* fall through */ + default: + delay = filedelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= BX_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise append to end (but before metadata) + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & BX_VNDIRTY)) { + /* + * Found the best place to insert the buffer + */ + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; + } else { + /* + * Missed, append to end, but before meta-data. + * We know that the head buffer in the list is + * not meta-data due to prior conditionals. + * + * Indirect effects: NFS second stage write + * tends to wind up here, giving maximum + * distance between the unstable write and the + * commit rpc. + */ + tbp = TAILQ_LAST(listheadp, buflists); + while (tbp && tbp->b_lblkno < 0) + tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } + } else { + bp->b_xflags |= BX_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + if (bp->b_vp != newvp) { + bp->b_vp = newvp; + vhold(bp->b_vp); + } + splx(s); +} + +/* + * Create a vnode for a device. + * Used for mounting the root filesystem. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + if (dev == NODEV) { + *vpp = NULLVP; + return (ENXIO); + } + if (vfinddev(dev, VCHR, vpp)) + return (0); + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VCHR; + addalias(vp, dev); + *vpp = vp; + return (0); +} + +/* + * Add vnode to the alias list hung off the dev_t. + * + * The reason for this gunk is that multiple vnodes can reference + * the same physical device, so checking vp->v_usecount to see + * how many users there are is inadequate; the v_usecount for + * the vnodes need to be accumulated. vcount() does that. + */ +struct vnode * +addaliasu(nvp, nvp_rdev) + struct vnode *nvp; + udev_t nvp_rdev; +{ + struct vnode *ovp; + vop_t **ops; + dev_t dev; + + if (nvp->v_type == VBLK) + return (nvp); + if (nvp->v_type != VCHR) + panic("addaliasu on non-special vnode"); + dev = udev2dev(nvp_rdev, 0); + /* + * Check to see if we have a bdevvp vnode with no associated + * filesystem. 
If so, we want to associate the filesystem of + * the new newly instigated vnode with the bdevvp vnode and + * discard the newly created vnode rather than leaving the + * bdevvp vnode lying around with no associated filesystem. + */ + if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { + addalias(nvp, dev); + return (nvp); + } + /* + * Discard unneeded vnode, but save its node specific data. + * Note that if there is a lock, it is carried over in the + * node specific data to the replacement vnode. + */ + vref(ovp); + ovp->v_data = nvp->v_data; + ovp->v_tag = nvp->v_tag; + nvp->v_data = NULL; + lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, + nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); + if (nvp->v_vnlock) + ovp->v_vnlock = &ovp->v_lock; + ops = ovp->v_op; + ovp->v_op = nvp->v_op; + if (VOP_ISLOCKED(nvp, curthread)) { + VOP_UNLOCK(nvp, 0, curthread); + vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread); + } + nvp->v_op = ops; + insmntque(ovp, nvp->v_mount); + vrele(nvp); + vgone(nvp); + return (ovp); +} + +/* This is a local helper function that do the same as addaliasu, but for a + * dev_t instead of an udev_t. */ +static void +addalias(nvp, dev) + struct vnode *nvp; + dev_t dev; +{ + + KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); + nvp->v_rdev = dev; + mtx_lock(&spechash_mtx); + SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); + mtx_unlock(&spechash_mtx); +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. The vnode lock bit is set if the + * vnode is being eliminated in vgone. The process is awakened + * when the transition is completed, and an error returned to + * indicate that the vnode is no longer usable (possibly having + * been changed to a new filesystem type). + */ +int +vget(vp, flags, td) + register struct vnode *vp; + int flags; + struct thread *td; +{ + int error; + + /* + * If the vnode is in the process of being cleaned out for + * another use, we wait for the cleaning to finish and then + * return failure. Cleaning is determined by checking that + * the VXLOCK flag is set. + */ + if ((flags & LK_INTERLOCK) == 0) + mtx_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + if (vp->v_vxproc == curthread) { +#if 0 + /* this can now occur in normal operation */ + log(LOG_INFO, "VXLOCK interlock avoided\n"); +#endif + } else { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, "vget", 0); + return (ENOENT); + } + } + + vp->v_usecount++; + + if (VSHOULDBUSY(vp)) + vbusy(vp); + if (flags & LK_TYPE_MASK) { + if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { + /* + * must expand vrele here because we do not want + * to call VOP_INACTIVE if the reference count + * drops back to zero since it was never really + * active. We must remove it from the free list + * before sleeping so that multiple processes do + * not try to recycle it. + */ + mtx_lock(&vp->v_interlock); + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + mtx_unlock(&vp->v_interlock); + } + return (error); + } + mtx_unlock(&vp->v_interlock); + return (0); +} + +/* + * Increase the reference count of a vnode. + */ +void +vref(struct vnode *vp) +{ + mtx_lock(&vp->v_interlock); + vp->v_usecount++; + mtx_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. 
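+ *
+ * vrele() must lock the vnode itself before it can call VOP_INACTIVE();
+ * vput() further below is the variant for callers that already hold the
+ * vnode lock.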
+ */ +void +vrele(vp) + struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + KASSERT(vp != NULL, ("vrele: null vp")); + + mtx_lock(&vp->v_interlock); + + /* Skip this v_writecount check if we're going to panic below. */ + KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, + ("vrele: missed vn_close")); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + mtx_unlock(&vp->v_interlock); + + return; + } + + if (vp->v_usecount == 1) { + vp->v_usecount--; + /* + * We must call VOP_INACTIVE with the node locked. + * If we are doing a vput, the node is already locked, + * but, in the case of vrele, we must explicitly lock + * the vnode before calling VOP_INACTIVE. + */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) + VOP_INACTIVE(vp, td); + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + mtx_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +/* + * Release an already locked vnode. This give the same effects as + * unlock+vrele(), but takes less time and avoids releasing and + * re-aquiring the lock (as vrele() aquires the lock internally.) + */ +void +vput(vp) + struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + GIANT_REQUIRED; + + KASSERT(vp != NULL, ("vput: null vp")); + mtx_lock(&vp->v_interlock); + /* Skip this v_writecount check if we're going to panic below. */ + KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, + ("vput: missed vn_close")); + + if (vp->v_usecount > 1) { + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, td); + return; + } + + if (vp->v_usecount == 1) { + vp->v_usecount--; + /* + * We must call VOP_INACTIVE with the node locked. + * If we are doing a vput, the node is already locked, + * so we just need to release the vnode mutex. + */ + mtx_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, td); + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * Note that there is one less who cares about this vnode. vdrop() is the + * opposite of vhold(). + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + else + vlruvp(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If FORCECLOSE is not specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If FORCECLOSE is specified, detach any active vnodes + * that are found. + * + * If WRITECLOSE is set, only flush out regular file vnodes open for + * writing. + * + * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. + * + * `rootrefs' specifies the base reference count for the root vnode + * of this filesystem. The root vnode is considered busy if its + * v_usecount exceeds this value. On a successful return, vflush() + * will call vrele() on the root vnode exactly rootrefs times. + * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must + * be zero. 
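+ *
+ * The scan restarts from the head of mnt_nvnodelist whenever a vnode
+ * turns out to have been recycled onto another mount point while the
+ * list was unlocked.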
+ */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, rootrefs, flags) + struct mount *mp; + int rootrefs; + int flags; +{ + struct thread *td = curthread; /* XXX */ + struct vnode *vp, *nvp, *rootvp = NULL; + struct vattr vattr; + int busy = 0, error; + + if (rootrefs > 0) { + KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, + ("vflush: bad args")); + /* + * Get the filesystem root vnode. We can vput() it + * immediately, since with rootrefs > 0, it won't go away. + */ + if ((error = VFS_ROOT(mp, &rootvp)) != 0) + return (error); + vput(rootvp); + } + mtx_lock(&mntvnode_mtx); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + + mtx_unlock(&mntvnode_mtx); + mtx_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + continue; + } + /* + * If WRITECLOSE is set, flush out unlinked but still open + * files (even if open only for reading) and regular file + * vnodes open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_type == VNON || + (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && + vattr.va_nlink > 0)) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + vgonel(vp, td); + mtx_lock(&mntvnode_mtx); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. + */ + if (flags & FORCECLOSE) { + if (vp->v_type != VCHR) { + vgonel(vp, td); + } else { + vclean(vp, 0, td); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + mtx_lock(&mntvnode_mtx); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + mtx_unlock(&vp->v_interlock); + mtx_lock(&mntvnode_mtx); + busy++; + } + mtx_unlock(&mntvnode_mtx); + if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { + /* + * If just the root vnode is busy, and if its refcount + * is equal to `rootrefs', then go ahead and kill it. + */ + mtx_lock(&rootvp->v_interlock); + KASSERT(busy > 0, ("vflush: not busy")); + KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); + if (busy == 1 && rootvp->v_usecount == rootrefs) { + vgonel(rootvp, td); + busy = 0; + } else + mtx_unlock(&rootvp->v_interlock); + } + if (busy) + return (EBUSY); + for (; rootrefs > 0; rootrefs--) + vrele(rootvp); + return (0); +} + +/* + * This moves a now (likely recyclable) vnode to the end of the + * mountlist. XXX However, it is temporarily disabled until we + * can clean up ffs_sync() and friends, which have loop restart + * conditions which this code causes to operate O(N^2). 
+ */ +static void +vlruvp(struct vnode *vp) +{ +#if 0 + struct mount *mp; + + if ((mp = vp->v_mount) != NULL) { + mtx_lock(&mntvnode_mtx); + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); + } +#endif +} + +/* + * Disassociate the underlying filesystem from a vnode. + */ +static void +vclean(vp, flags, td) + struct vnode *vp; + int flags; + struct thread *td; +{ + int active; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + vp->v_vxproc = curthread; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); + + /* + * Clean out any buffers associated with the vnode. + * If the flush fails, just toss the buffers. + */ + if (flags & DOCLOSE) { + if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) + (void) vn_write_suspend_wait(vp, NULL, V_WAIT); + if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0) + vinvalbuf(vp, 0, NOCRED, td, 0, 0); + } + + VOP_DESTROYVOBJECT(vp); + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); + VOP_INACTIVE(vp, td); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, td); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, td)) + panic("vclean: cannot reclaim"); + + if (active) { + /* + * Inline copy of vrele() since VOP_INACTIVE + * has already been called. + */ + mtx_lock(&vp->v_interlock); + if (--vp->v_usecount <= 0) { +#ifdef DIAGNOSTIC + if (vp->v_usecount < 0 || vp->v_writecount != 0) { + vprint("vclean: bad ref count", vp); + panic("vclean: ref cnt"); + } +#endif + vfree(vp); + } + mtx_unlock(&vp->v_interlock); + } + + cache_purge(vp); + vp->v_vnlock = NULL; + lockdestroy(&vp->v_lock); + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + if (vp->v_pollinfo != NULL) + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. + */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + dev_t dev; + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. 
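+	 * Otherwise every vnode aliased to the same dev_t is torn down in
+	 * turn, always taking the first entry on the spechash list until
+	 * the list is empty.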
+ */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, + "vop_revokeall", 0); + return (0); + } + dev = vp->v_rdev; + for (;;) { + mtx_lock(&spechash_mtx); + vq = SLIST_FIRST(&dev->si_hlist); + mtx_unlock(&spechash_mtx); + if (!vq) + break; + vgone(vq); + } + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, td) + struct vnode *vp; + struct mtx *inter_lkp; + struct thread *td; +{ + + mtx_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + mtx_unlock(inter_lkp); + } + vgonel(vp, td); + return (1); + } + mtx_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct thread *td = curthread; /* XXX */ + + mtx_lock(&vp->v_interlock); + vgonel(vp, td); +} + +/* + * vgone, with the vp interlock held. + */ +void +vgonel(vp, td) + struct vnode *vp; + struct thread *td; +{ + int s; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, td); + mtx_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. + */ + if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { + mtx_lock(&spechash_mtx); + SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); + freedev(vp->v_rdev); + mtx_unlock(&spechash_mtx); + vp->v_rdev = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the + * VDOOMED flag and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + if (vp->v_flag & VFREE) + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + mtx_unlock(&vnode_free_list_mtx); + splx(s); + } + + vp->v_type = VBAD; + mtx_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + struct vnode *vp; + + mtx_lock(&spechash_mtx); + SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { + if (type == vp->v_type) { + *vpp = vp; + mtx_unlock(&spechash_mtx); + return (1); + } + } + mtx_unlock(&spechash_mtx); + return (0); +} + +/* + * Calculate the total number of references to a special device. 
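+ * The count is the sum of v_usecount over every vnode on the device's
+ * alias list, since a single dev_t may be referenced through several
+ * vnodes.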
+ */ +int +vcount(vp) + struct vnode *vp; +{ + struct vnode *vq; + int count; + + count = 0; + mtx_lock(&spechash_mtx); + SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) + count += vq->v_usecount; + mtx_unlock(&spechash_mtx); + return (count); +} + +/* + * Same as above, but using the dev_t as argument + */ +int +count_dev(dev) + dev_t dev; +{ + struct vnode *vp; + + vp = SLIST_FIRST(&dev->si_hlist); + if (vp == NULL) + return (0); + return(vcount(vp)); +} + +/* + * Print out a description of a vnode. + */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct thread *td = curthread; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + mtx_lock(&mntvnode_mtx); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (VOP_ISLOCKED(vp, NULL)) + vprint((char *)0, vp); + } + mtx_unlock(&mntvnode_mtx); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + + /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, td)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if COMPILING_LINT +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode(SYSCTL_HANDLER_ARGS) +{ + struct thread *td = curthread; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + mtx_lock(&mntvnode_mtx); +again: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) + goto again; + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + mtx_unlock(&mntvnode_mtx); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + mtx_lock(&mntvnode_mtx); + } + mtx_unlock(&mntvnode_mtx); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + + return (0); +} + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. + */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + + if (vp->v_rdev->si_mountpoint != NULL) + return (EBUSY); + return (0); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. 
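+ * Each filesystem is unmounted with MNT_FORCE; if even a forced unmount
+ * fails, the mount is dropped from the mountlist anyway so that the
+ * loop is guaranteed to terminate.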
+ */ +void +vfs_unmountall() +{ + struct mount *mp; + struct thread *td; + int error; + + if (curthread != NULL) + td = curthread; + else + td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + while(!TAILQ_EMPTY(&mountlist)) { + mp = TAILQ_LAST(&mountlist, mntlist); + error = dounmount(mp, MNT_FORCE, td); + if (error) { + TAILQ_REMOVE(&mountlist, mp, mnt_list); + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } else { + /* The unmount has removed mp from the mountlist */ + } + } +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) +{ + struct vnode *vp, *nvp; + struct vm_object *obj; + int tries; + + GIANT_REQUIRED; + + tries = 5; + mtx_lock(&mntvnode_mtx); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { + if (vp->v_mount != mp) { + if (--tries > 0) + goto loop; + break; + } + nvp = TAILQ_NEXT(vp, v_nmntvnodes); + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (vp->v_flag & VNOSYNC) /* unlinked, skip it */ + continue; + + if ((vp->v_flag & VOBJDIRTY) && + (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { + mtx_unlock(&mntvnode_mtx); + if (!vget(vp, + LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) { + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, + flags == MNT_WAIT ? + OBJPC_SYNC : OBJPC_NOSYNC); + } + vput(vp); + } + mtx_lock(&mntvnode_mtx); + if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { + if (--tries > 0) + goto loop; + break; + } + } + } + mtx_unlock(&mntvnode_mtx); +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, td, cred) + struct vnode *vp; + struct thread *td; + struct ucred *cred; +{ + GIANT_REQUIRED; + return (VOP_CREATEVOBJECT(vp, cred, td)); +} + +/* + * Mark a vnode as free, putting it up for recycling. + */ +void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + mtx_unlock(&vnode_free_list_mtx); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +/* + * Opposite of vfree() - mark a vnode as in use. + */ +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + mtx_lock(&vnode_free_list_mtx); + KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. Because poll uses the historic select-style interface + * internally, this routine serves as both the ``check for any + * pending events'' and the ``record my interest in future events'' + * functions. (These are done together, while the lock is held, + * to avoid race conditions.) 
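+ *
+ * Any previously recorded events that intersect the request are returned
+ * and cleared immediately; otherwise the interest is noted in vpi_events
+ * and the calling thread is registered with selrecord().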
+ */ +int +vn_pollrecord(vp, td, events) + struct vnode *vp; + struct thread *td; + short events; +{ + + if (vp->v_pollinfo == NULL) + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_revents & events) { + /* + * This leaves events we are not interested + * in available for the other process which + * which presumably had requested them + * (otherwise they would never have been + * recorded). + */ + events &= vp->v_pollinfo->vpi_revents; + vp->v_pollinfo->vpi_revents &= ~events; + + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return events; + } + vp->v_pollinfo->vpi_events |= events; + selrecord(td, &vp->v_pollinfo->vpi_selinfo); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return 0; +} + +/* + * Note the occurrence of an event. If the VN_POLLEVENT macro is used, + * it is possible for us to miss an event due to race conditions, but + * that condition is expected to be rare, so for the moment it is the + * preferred interface. + */ +void +vn_pollevent(vp, events) + struct vnode *vp; + short events; +{ + + if (vp->v_pollinfo == NULL) + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_events & events) { + /* + * We clear vpi_events so that we don't + * call selwakeup() twice if two events are + * posted before the polling process(es) is + * awakened. This also ensures that we take at + * most one selwakeup() if the polling process + * is no longer interested. However, it does + * mean that only one event can be noticed at + * a time. (Perhaps we should only clear those + * event bits which we note?) XXX + */ + vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */ + vp->v_pollinfo->vpi_revents |= events; + selwakeup(&vp->v_pollinfo->vpi_selinfo); + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); +} + +/* + * Wake up anyone polling on vp because it is being revoked. + * This depends on dead_poll() returning POLLHUP for correct + * behavior. + */ +void +vn_pollgone(vp) + struct vnode *vp; +{ + + mtx_lock(&vp->v_pollinfo->vpi_lock); + VN_KNOTE(vp, NOTE_REVOKE); + if (vp->v_pollinfo->vpi_events) { + vp->v_pollinfo->vpi_events = 0; + selwakeup(&vp->v_pollinfo->vpi_selinfo); + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); +} + + + +/* + * Routine to create and manage a filesystem syncer vnode. 
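+ *
+ * Each mount point gets at most one syncer vnode (mp->mnt_syncer);
+ * sync_fsync() implements the MNT_LAZY sync that runs when the syncer
+ * daemon reaches this vnode on its worklist.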
+ */ +#define sync_close ((int (*)(struct vop_close_args *))nullop) +static int sync_fsync(struct vop_fsync_args *); +static int sync_inactive(struct vop_inactive_args *); +static int sync_reclaim(struct vop_reclaim_args *); +#define sync_lock ((int (*)(struct vop_lock_args *))vop_nolock) +#define sync_unlock ((int (*)(struct vop_unlock_args *))vop_nounlock) +static int sync_print(struct vop_print_args *); +#define sync_islocked ((int(*)(struct vop_islocked_args *))vop_noislocked) + +static vop_t **sync_vnodeop_p; +static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_close_desc, (vop_t *) sync_close }, /* close */ + { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ + { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ + { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ + { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ + { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ + { &vop_print_desc, (vop_t *) sync_print }, /* print */ + { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ + { NULL, NULL } +}; +static struct vnodeopv_desc sync_vnodeop_opv_desc = + { &sync_vnodeop_p, sync_vnodeop_entries }; + +VNODEOP_SET(sync_vnodeop_opv_desc); + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +int +vfs_allocate_syncvnode(mp) + struct mount *mp; +{ + struct vnode *vp; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { + mp->mnt_syncer = NULL; + return (error); + } + vp->v_type = VNON; + /* + * Place the vnode onto the syncer worklist. We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); + mp->mnt_syncer = vp; + return (0); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct thread *a_td; + } */ *ap; +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + struct thread *td = ap->a_td; + int asyncflag; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + vn_syncer_add_to_worklist(syncvp, syncdelay); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + mtx_lock(&mountlist_mtx); + if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { + mtx_unlock(&mountlist_mtx); + return (0); + } + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp, td); + return (0); + } + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vn_finished_write(mp); + vfs_unbusy(mp, td); + return (0); +} + +/* + * The syncer vnode is no referenced. 
+ */ +static int +sync_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. + * + * Modifications to the worklist must be protected at splbio(). + */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int s; + + s = splbio(); + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + splx(s); + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} + +/* + * extract the dev_t from a VCHR + */ +dev_t +vn_todev(vp) + struct vnode *vp; +{ + if (vp->v_type != VCHR) + return (NODEV); + return (vp->v_rdev); +} + +/* + * Check if vnode represents a disk device + */ +int +vn_isdisk(vp, errp) + struct vnode *vp; + int *errp; +{ + struct cdevsw *cdevsw; + + if (vp->v_type != VCHR) { + if (errp != NULL) + *errp = ENOTBLK; + return (0); + } + if (vp->v_rdev == NULL) { + if (errp != NULL) + *errp = ENXIO; + return (0); + } + cdevsw = devsw(vp->v_rdev); + if (cdevsw == NULL) { + if (errp != NULL) + *errp = ENXIO; + return (0); + } + if (!(cdevsw->d_flags & D_DISK)) { + if (errp != NULL) + *errp = ENOTBLK; + return (0); + } + if (errp != NULL) + *errp = 0; + return (1); +} + +/* + * Free data allocated by namei(); see namei(9) for details. + */ +void +NDFREE(ndp, flags) + struct nameidata *ndp; + const uint flags; +{ + if (!(flags & NDF_NO_FREE_PNBUF) && + (ndp->ni_cnd.cn_flags & HASBUF)) { + uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + ndp->ni_cnd.cn_flags &= ~HASBUF; + } + if (!(flags & NDF_NO_DVP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKPARENT) && + ndp->ni_dvp != ndp->ni_vp) + VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); + if (!(flags & NDF_NO_DVP_RELE) && + (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + } + if (!(flags & NDF_NO_VP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) + VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); + if (!(flags & NDF_NO_VP_RELE) && + ndp->ni_vp) { + vrele(ndp->ni_vp); + ndp->ni_vp = NULL; + } + if (!(flags & NDF_NO_STARTDIR_RELE) && + (ndp->ni_cnd.cn_flags & SAVESTART)) { + vrele(ndp->ni_startdir); + ndp->ni_startdir = NULL; + } +} + +/* + * Common filesystem object access control check routine. Accepts a + * vnode's type, "mode", uid and gid, requested access mode, credentials, + * and optional call-by-reference privused argument allowing vaccess() + * to indicate to the caller whether privilege was used to satisfy the + * request. Returns 0 on success, or an errno on failure. + */ +int +vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) + enum vtype type; + mode_t file_mode; + uid_t file_uid; + gid_t file_gid; + mode_t acc_mode; + struct ucred *cred; + int *privused; +{ + mode_t dac_granted; +#ifdef CAPABILITIES + mode_t cap_granted; +#endif + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. + */ + + if (privused != NULL) + *privused = 0; + + dac_granted = 0; + + /* Check the owner. 
*/ + if (cred->cr_uid == file_uid) { + dac_granted |= VADMIN; + if (file_mode & S_IXUSR) + dac_granted |= VEXEC; + if (file_mode & S_IRUSR) + dac_granted |= VREAD; + if (file_mode & S_IWUSR) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + goto privcheck; + } + + /* Otherwise, check the groups (first match) */ + if (groupmember(file_gid, cred)) { + if (file_mode & S_IXGRP) + dac_granted |= VEXEC; + if (file_mode & S_IRGRP) + dac_granted |= VREAD; + if (file_mode & S_IWGRP) + dac_granted |= VWRITE; + + if ((acc_mode & dac_granted) == acc_mode) + return (0); + + goto privcheck; + } + + /* Otherwise, check everyone else. */ + if (file_mode & S_IXOTH) + dac_granted |= VEXEC; + if (file_mode & S_IROTH) + dac_granted |= VREAD; + if (file_mode & S_IWOTH) + dac_granted |= VWRITE; + if ((acc_mode & dac_granted) == acc_mode) + return (0); + +privcheck: + if (!suser_cred(cred, PRISON_ROOT)) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } + +#ifdef CAPABILITIES + /* + * Build a capability mask to determine if the set of capabilities + * satisfies the requirements when combined with the granted mask + * from above. + * For each capability, if the capability is required, bitwise + * or the request type onto the cap_granted mask. + */ + cap_granted = 0; + + if (type == VDIR) { + /* + * For directories, use CAP_DAC_READ_SEARCH to satisfy + * VEXEC requests, instead of CAP_DAC_EXECUTE. + */ + if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VEXEC; + } else { + if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) + cap_granted |= VEXEC; + } + + if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && + !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) + cap_granted |= VREAD; + + if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && + !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) + cap_granted |= VWRITE; + + if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && + !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) + cap_granted |= VADMIN; + + if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } +#endif + + return ((acc_mode & VADMIN) ? EPERM : EACCES); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 0000000..1244e54 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,4862 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $FreeBSD$ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" +#include "opt_ffs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/sysent.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/sx.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/dirent.h> +#include <sys/extattr.h> +#include <sys/jail.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> +#include <machine/stdarg.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/uma.h> + +static int change_dir(struct nameidata *ndp, struct thread *td); +static void checkdirs(struct vnode *olddp, struct vnode *newdp); +static int chroot_refuse_vdir_fds(struct filedesc *fdp); +static int getutimes(const struct timeval *, struct timespec *); +static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); +static int setfmode(struct thread *td, struct vnode *, int); +static int setfflags(struct thread *td, struct vnode *, int); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int vfs_nmount(struct thread *td, int, struct uio *); + +static int usermount = 0; /* if 1, non-root can mount fs. 
*/ + +int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +/* ARGSUSED */ +int +nmount(td, uap) + struct thread *td; + struct nmount_args /* { + syscallarg(struct iovec *) iovp; + syscallarg(unsigned int) iovcnt; + syscallarg(int) flags; + } */ *uap; +{ + struct uio auio; + struct iovec *iov, *needfree; + struct iovec aiov[UIO_SMALLIOV]; + unsigned int i; + int error; + u_int iovlen, iovcnt; + + iovcnt = SCARG(uap, iovcnt); + iovlen = iovcnt * sizeof (struct iovec); + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4) || (iovcnt > UIO_MAXIOV)) + return (EINVAL); + + if (iovcnt > UIO_SMALLIOV) { + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_USERSPACE; + if ((error = copyin(uap->iovp, iov, iovlen))) + goto finish; + + for (i = 0; i < iovcnt; i++) { + if (iov->iov_len > MMAXOPTIONLEN) { + error = EINVAL; + goto finish; + } + iov++; + } + error = vfs_nmount(td, SCARG(uap, flags), &auio); +finish: + if (needfree != NULL) + free(needfree, M_TEMP); + return (error); +} + +/* + * Release all resources related to the + * mount options. + */ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); + } + free(opts, M_MOUNT); +} + +int +kernel_mount(iovp, iovcnt, flags) + struct iovec *iovp; + unsigned int iovcnt; + int flags; +{ + struct uio auio; + int error; + + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) + return (EINVAL); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + return (error); +} + +int +kernel_vmount(int flags, ...) +{ + struct iovec *iovp; + struct uio auio; + va_list ap; + unsigned int iovcnt, iovlen, len; + const char *cp; + char *buf, *pos; + size_t n; + int error, i; + + len = 0; + va_start(ap, flags); + for (iovcnt = 0; (cp = va_arg(ap, const char *)) != NULL; iovcnt++) + len += strlen(cp) + 1; + va_end(ap); + + if (iovcnt < 4 || iovcnt & 1) + return (EINVAL); + + iovlen = iovcnt * sizeof (struct iovec); + MALLOC(iovp, struct iovec *, iovlen, M_MOUNT, M_WAITOK); + MALLOC(buf, char *, len, M_MOUNT, M_WAITOK); + pos = buf; + va_start(ap, flags); + for (i = 0; i < iovcnt; i++) { + cp = va_arg(ap, const char *); + copystr(cp, pos, len - (pos - buf), &n); + iovp[i].iov_base = pos; + iovp[i].iov_len = n; + pos += n; + } + va_end(ap); + + auio.uio_iov = iovp; + auio.uio_iovcnt = iovcnt; + auio.uio_segflg = UIO_SYSSPACE; + + error = vfs_nmount(curthread, flags, &auio); + FREE(iovp, M_MOUNT); + FREE(buf, M_MOUNT); + return (error); +} + +/* + * vfs_nmount(): actually attempt a filesystem mount. + */ +static int +vfs_nmount(td, fsflags, fsoptions) + struct thread *td; + int fsflags; /* Flags common to all filesystems. */ + struct uio *fsoptions; /* Options local to the filesystem. 
*/ +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + struct vfsoptlist *optlist; + char *fstype, *fspath; + int error, flag = 0, kern_flag = 0; + int fstypelen, fspathlen; + struct vattr va; + struct nameidata nd; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + goto bad; + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { + error = ENAMETOOLONG; + goto bad; + } + + if (usermount == 0) { + error = suser(td); + if (error) + goto bad; + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + goto bad; + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + goto bad; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + error = EINVAL; + goto bad; + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + error = EOPNOTSUPP; /* Needs translation */ + goto bad; + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + error = EBUSY; + goto bad; + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + goto bad; + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + goto bad; + } + if (vp->v_type != VDIR) { + vput(vp); + error = ENOTDIR; + goto bad; + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). 
*/ + error = suser(td); + if (error) { + vput(vp); + goto bad; + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + goto bad; + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + goto bad; + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + error = ENODEV; + goto bad; + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + error = EBUSY; + goto bad; + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); + +update: + mp->mnt_optnew = optlist; + /* + * Check if the fs implements the new VFS_NMOUNT() + * function, since the new system call was used. + */ + if (mp->mnt_op->vfs_mount != NULL) { + printf("%s doesn't support the new mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + error = EOPNOTSUPP; + goto bad; + } + + /* + * Set the mount level flags. + */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_NMOUNT(mp, &nd, td); + if (!error) { + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + } + /* + * Prevent external consumers of mount + * options to read mnt_optnew. + */ + mp->mnt_optnew = NULL; + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) { + vrele(vp); + goto bad; + } + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + goto bad; + } + return (0); +bad: + vfs_freeopts(optlist); + return (error); +} + +/* + * Old Mount API. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(td, uap) + struct thread *td; + struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + char *fstype; + char *fspath; + int error; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK); + + /* + * vfs_mount() actually takes a kernel string for `type' and + * `path' now, so extract them. + */ + error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); + if (error) + goto finish; + error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); + if (error) + goto finish; + error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), + SCARG(uap, data)); +finish: + free(fstype, M_TEMP); + free(fspath, M_TEMP); + return (error); +} + +/* + * vfs_mount(): actually attempt a filesystem mount. + * + * This routine is designed to be a "generic" entry point for routines + * that wish to mount a filesystem. All parameters except `fsdata' are + * pointers into kernel space. `fsdata' is currently still a pointer + * into userspace. + */ +int +vfs_mount(td, fstype, fspath, fsflags, fsdata) + struct thread *td; + const char *fstype; + char *fspath; + int fsflags; + void *fsdata; +{ + linker_file_t lf; + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, kern_flag = 0; + struct vattr va; + struct nameidata nd; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (usermount == 0) { + error = suser(td); + if (error) + return (error); + } + /* + * Do not allow NFS export by non-root users. + */ + if (fsflags & MNT_EXPORTED) { + error = suser(td); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (suser(td)) + fsflags |= MNT_NOSUID | MNT_NODEV; + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (fsflags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + kern_flag = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. 
+ */ + if ((fsflags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if (vfs_busy(mp, LK_NOWAIT, 0, td)) { + vput(vp); + return (EBUSY); + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vfs_unbusy(mp, td); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_flag |= fsflags & + (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); + VOP_UNLOCK(vp, 0, td); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + if (va.va_uid != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + if ((error = vinvalbuf(vp, V_SAVE, td->td_ucred, td, 0, 0)) != 0) { + vput(vp); + return (error); + } + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + /* Only load modules for root (very important!). */ + error = suser(td); + if (error) { + vput(vp); + return (error); + } + error = securelevel_gt(td->td_ucred, 0); + if (error) { + vput(vp); + return (error); + } + error = linker_load_file(fstype, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return (error); + } + lf->userrefs++; + /* Look up again to see if the VFS was loaded. */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstype)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + mtx_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + mtx_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&mp->mnt_nvnodelist); + TAILQ_INIT(&mp->mnt_reservedvnlist); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, td); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = td->td_ucred->cr_uid; + strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; + VOP_UNLOCK(vp, 0, td); +update: + /* + * Check if the fs implements the old VFS_MOUNT() + * function, since the old system call was used. + */ + if (mp->mnt_op->vfs_mount == NULL) { + printf("%s doesn't support the old mount syscall\n", + mp->mnt_vfc->vfc_name); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + if (mp->mnt_flag & MNT_UPDATE) + vfs_unbusy(mp, td); + else { + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + } + vrele(vp); + return (EOPNOTSUPP); + } + + /* + * Set the mount level flags. 
+ */ + if (fsflags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); + if (mp->mnt_flag & MNT_UPDATE) { + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = kern_flag; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, td); + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + vrele(vp); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + struct vnode *newdp; + + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + mtx_unlock(&vp->v_interlock); + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + if (VFS_ROOT(mp, &newdp)) + panic("mount: lost mount"); + checkdirs(vp, newdp); + vput(newdp); + VOP_UNLOCK(vp, 0, td); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, td); + if ((error = VFS_START(mp, 0, td)) != 0) + vrele(vp); + } else { + mtx_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + mtx_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, td); + free(mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new + * mount point. + */ +static void +checkdirs(olddp, newdp) + struct vnode *olddp, *newdp; +{ + struct filedesc *fdp; + struct proc *p; + int nrele; + + if (olddp->v_usecount == 1) + return; + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + PROC_LOCK(p); + fdp = p->p_fd; + if (fdp == NULL) { + PROC_UNLOCK(p); + continue; + } + nrele = 0; + FILEDESC_LOCK(fdp); + if (fdp->fd_cdir == olddp) { + VREF(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + VREF(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + FILEDESC_UNLOCK(fdp); + PROC_UNLOCK(p); + while (nrele--) + vrele(olddp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(td, uap) + struct thread *td; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if (mp->mnt_stat.f_owner != td->td_ucred->cr_uid) { + error = suser(td); + if (error) { + vput(vp); + return (error); + } + } + + /* + * Don't allow unmounting the root filesystem. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), td)); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + struct vnode *coveredvp, *fsrootvp; + int error; + int async_flag; + + mtx_lock(&mountlist_mtx); + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + mtx_unlock(&mountlist_mtx); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | + ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); + if (error) { + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + vn_start_write(NULL, &mp, V_WAIT); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + /* Move process cdir/rdir refs on fs root to underlying vnode. */ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(fsrootvp, mp->mnt_vnodecovered); + if (fsrootvp == rootvnode) { + vrele(rootvnode); + rootvnode = NULL; + } + vput(fsrootvp); + } + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) == 0) || + (flags & MNT_FORCE)) { + error = VFS_UNMOUNT(mp, flags, td); + } + vn_finished_write(mp); + if (error) { + /* Undo cdir/rdir and rootvnode changes made above. 
*/ + if (VFS_ROOT(mp, &fsrootvp) == 0) { + if (mp->mnt_vnodecovered != NULL) + checkdirs(mp->mnt_vnodecovered, fsrootvp); + if (rootvnode == NULL) { + rootvnode = fsrootvp; + vref(rootvnode); + } + vput(fsrootvp); + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mtx_lock(&mountlist_mtx); + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, + &mountlist_mtx, td); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) + coveredvp->v_mountedhere = NULL; + mp->mnt_vfc->vfc_refcount--; + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); + lockdestroy(&mp->mnt_lock); + if (coveredvp != NULL) + vrele(coveredvp); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup(mp); + if (mp->mnt_op->vfs_mount == NULL) + vfs_freeopts(mp->mnt_opt); + free(mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(td, uap) + struct thread *td; + struct sync_args *uap; +{ + struct mount *mp, *nmp; + int asyncflag; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((td != NULL) ? td->td_ucred : NOCRED), td); + mp->mnt_flag |= asyncflag; + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* XXX PRISON: could be per prison flag */ +static int prison_quotas; +#if 0 +SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); +#endif + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(td, uap) + struct thread *td; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + struct mount *mp; + int error; + struct nameidata nd; + + if (jailed(td->td_ucred) && !prison_quotas) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); + vrele(nd.ni_vp); + if (error) + return (error); + error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), td); + vn_finished_write(mp); + return (error); +} + +/* + * Get filesystem statistics. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(td, uap) + struct thread *td; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(td, uap) + struct thread *td; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); + if (mp == NULL) + return (EBADF); + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, td); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(td, uap) + struct thread *td; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, td))) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout(sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, td); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp, td); + } + mtx_unlock(&mountlist_mtx); + if (sfsp && count > maxcount) + td->td_retval[0] = maxcount; + else + td->td_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(td, uap) + struct thread *td; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + struct vnode *vp, *tdp, *vpold; + struct mount *mp; + struct file *fp; + int error; + + if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + fdrop(fp, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, td)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, td); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(td, uap) + struct thread *td; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; + fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(fdp) + struct filedesc *fdp; +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK(fdp); + for (fd = 0; fd < fdp->fd_nfiles ; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VDIR) { + FILEDESC_UNLOCK(fdp); + return (EPERM); + } + } + } + FILEDESC_UNLOCK(fdp); + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. + * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, ""); + +/* + * Change notion of root (``/'') directory. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(td, uap) + struct thread *td; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = td->td_proc->p_fd; + int error; + struct nameidata nd; + struct vnode *vp; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + FILEDESC_LOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); + error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = change_dir(&nd, td)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; + fdp->fd_rdir = nd.ni_vp; + if (!fdp->fd_jdir) { + fdp->fd_jdir = nd.ni_vp; + VREF(fdp->fd_jdir); + } + FILEDESC_UNLOCK(fdp); + vrele(vp); + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, td) + register struct nameidata *ndp; + struct thread *td; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, td); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(td, uap) + struct thread *td; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct vattr vat; + struct mount *mp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(td, &nfp, &indx); + if (error) + return (error); + fp = nfp; + FILEDESC_LOCK(fdp); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + td->td_dupfd = -indx - 1; /* XXX check for fdopen */ + /* + * Bump the ref count to prevent another process from closing + * the descriptor while we are blocked in vn_open() + */ + fhold(fp); + error = vn_open(&nd, &flags, cmode); + if (error) { + /* + * release our own reference + */ + fdrop(fp, td); + + /* + * handle special fdopen() case. bleh. dupfdopen() is + * responsible for dropping the old contents of ofiles[indx] + * if it succeeds. + */ + if ((error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { + td->td_retval[0] = indx; + return (0); + } + /* + * Clean up the descriptor, but only if another thread hadn't + * replaced or closed it. 
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + + if (error == ERESTART) + error = EINTR; + return (error); + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * There should be 2 references on the file, one from the descriptor + * table, and one for us. + * + * Handle the case where someone closed the file (via its file + * descriptor) while we were blocked. The end result should look + * like opening the file succeeded but it was immediately closed. + */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); + if (fp->f_count == 1) { + KASSERT(fdp->fd_ofiles[indx] != fp, + ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + vn_close(vp, flags & FMASK, fp->f_cred, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return 0; + } + + /* assert that vn_open created a backing object if one is needed */ + KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, + ("open: vmio vnode has no backing object after vn_open")); + + fp->f_data = vp; + fp->f_flag = flags & FMASK; + fp->f_ops = &vnops; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); + VOP_UNLOCK(vp, 0, td); + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) + goto bad; + fp->f_flag |= FHASLOCK; + } + if (flags & O_TRUNC) { + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto bad; + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + VATTR_NULL(&vat); + vat.va_size = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_SETATTR(vp, &vat, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + if (error) + goto bad; + } + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(td, uap) + struct thread *td; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(td, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(td, uap) + struct thread *td; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = suser(td); + break; + default: + error = suser_cred(td->td_ucred, PRISON_ROOT); + break; + } + if (error) + return (error); +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + vrele(vp); + error = EEXIST; + } else { + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(td, uap) + struct thread *td; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(td, uap) + struct thread *td; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct nameidata nd; + int error; + + bwillwrite(); + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + } + vrele(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(td, uap) + struct thread *td; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) + goto out; +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + vput(nd.ni_dvp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == 0) + vput(nd.ni_vp); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + uma_zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. 
+ */ +/* ARGSUSED */ +int +undelete(td, uap) + struct thread *td; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct mount *mp; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), td); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp) + vrele(nd.ni_vp); + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(td, uap) + struct thread *td; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (!error) { + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vput(vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(td, uap) + struct thread *td; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = td->td_ucred; + struct file *fp; + struct vnode *vp; + struct vattr vattr; + off_t offset; + int error, noneg; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (ESPIPE); + } + vp = (struct vnode *)fp->f_data; + noneg = (vp->v_type != VCHR); + offset = SCARG(uap, offset); + switch (SCARG(uap, whence)) { + case L_INCR: + if (noneg && + (fp->f_offset < 0 || + (offset > 0 && fp->f_offset > OFF_MAX - offset))) + return (EOVERFLOW); + offset += fp->f_offset; + break; + case L_XTND: + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, cred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) + return (EOVERFLOW); + offset += vattr.va_size; + break; + case L_SET: + break; + default: + fdrop(fp, td); + return (EINVAL); + } + if (noneg && offset < 0) + return (EINVAL); + fp->f_offset = offset; + *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(td, uap) + struct thread *td; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(td, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions using passed credentials. + */ +static int +vn_access(vp, user_flags, cred, td) + struct vnode *vp; + int user_flags; + struct ucred *cred; + struct thread *td; +{ + int error, flags; + + /* Flags == 0 means only check for existence. */ + error = 0; + if (user_flags) { + flags = 0; + if (user_flags & R_OK) + flags |= VREAD; + if (user_flags & W_OK) + flags |= VWRITE; + if (user_flags & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, td); + } + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(td, uap) + struct thread *td; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct ucred *cred, *tmpcred; + register struct vnode *vp; + int error; + struct nameidata nd; + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared. This could also mess up socket + * buffer accounting which can run in an interrupt context. + * + * XXX - Depending on how "threads" are finally implemented, it + * may be better to explicitly pass the credential to namei() + * rather than to modify the potentially shared process structure. 
+ */ + cred = td->td_ucred; + tmpcred = crdup(cred); + tmpcred->cr_uid = cred->cr_ruid; + tmpcred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = tmpcred; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + goto out1; + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), tmpcred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out1: + td->td_ucred = cred; + crfree(tmpcred); + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int flags; +}; +#endif +int +eaccess(td, uap) + struct thread *td; + register struct eaccess_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + struct nameidata nd; + struct vnode *vp; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + + error = vn_access(vp, SCARG(uap, flags), td->td_ucred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(td, uap) + struct thread *td; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(td, uap) + struct thread *td; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout(&osb, SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(td, uap) + struct thread *td; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + +#ifdef LOOKUP_SHARED + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | NOOBJ, + UIO_USERSPACE, SCARG(uap, path), td); +#else + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); +#endif + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(td, uap) + struct thread *td; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Implementation of the NetBSD stat() function. + * XXX This should probably be collapsed with the FreeBSD version, + * as the differences are only due to vn_stat() clearing spares at + * the end of the structures. vn_stat could be split to avoid this, + * and thus collapse the following to close to zero code. + */ +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + bzero(nsb, sizeof *nsb); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_createtimespec = sb->st_createtimespec; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(td, uap) + struct thread *td; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_stat(nd.ni_vp, &sb, td); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct nstat *) ub;
+ } */ *uap;
+{
+ int error;
+ struct vnode *vp;
+ struct stat sb;
+ struct nstat nsb;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_stat(vp, &sb, td);
+ vput(vp);
+ if (error)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ error = copyout(&nsb, SCARG(uap, ub), sizeof (nsb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+ SCARG(uap, path), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = SCARG(uap, count);
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+ vput(vp);
+ td->td_retval[0] = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ int flags;
+{
+ int error;
+ struct mount *mp;
+ struct vattr vattr;
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = suser_cred(td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
+ VOP_UNLOCK(vp, 0, td);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(td, uap) + struct thread *td; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +int +lchflags(td, uap) + struct thread *td; + register struct lchflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(td, uap) + struct thread *td; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +static int +setfmode(td, vp, mode) + struct thread *td; + struct vnode *vp; + int mode; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(td, uap) + struct thread *td; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(td, uap) + struct thread *td; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(td, uap) + struct thread *td; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +static int +setfown(td, vp, uid, gid) + struct thread *td; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(td, uap) + struct thread *td; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(td, uap) + struct thread *td; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(td, uap) + struct thread *td; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). 
+ */ +static int +getutimes(usrtvp, tsp) + const struct timeval *usrtvp; + struct timespec *tsp; +{ + struct timeval tv[2]; + int error; + + if (usrtvp == NULL) { + microtime(&tv[0]); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + tsp[1] = tsp[0]; + } else { + if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) + return (error); + TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); + } + return 0; +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +setutimes(td, vp, ts, nullflag) + struct thread *td; + struct vnode *vp; + const struct timespec *ts; + int nullflag; +{ + int error; + struct mount *mp; + struct vattr vattr; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(td, uap) + struct thread *td; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(td, uap) + struct thread *td; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct timeval *usrtvp; + int error; + struct nameidata nd; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(td, uap) + struct thread *td; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timespec ts[2]; + struct file *fp; + struct timeval *usrtvp; + int error; + + usrtvp = SCARG(uap, tptr); + if ((error = getutimes(usrtvp, ts)) != 0) + return (error); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); +} + +/* + * Truncate a file given its path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(td, uap) + struct thread *td; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); + } + vput(vp); + vn_finished_write(mp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(td, uap) + struct thread *td; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct mount *mp; + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); + return (EINVAL); + } + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); + } + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(td, uap) + struct thread *td; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(td, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(td, uap) + struct thread *td; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(td, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(td, uap) + struct thread *td; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + vm_object_t obj; + int error; + + GIANT_REQUIRED; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if (VOP_GETVOBJECT(vp, &obj) == 0) { + vm_object_page_clean(obj, 0, 0, 0); + } + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); +#ifdef SOFTUPDATES + if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) + error = softdep_fsync(vp); +#endif + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + fdrop(fp, td); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(td, uap) + struct thread *td; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + struct mount *mp; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + bwillwrite(); + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), td); + if ((error = namei(&fromnd)) != 0) + return (error); + fvp = fromnd.ni_vp; + if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(td, uap) + struct thread *td; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + + return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); +} + +int +vn_mkdir(path, mode, segflg, td) + char *path; + int mode; + enum uio_seg segflg; + struct thread *td; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + vn_finished_write(mp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(td, uap) + struct thread *td; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + int error; + struct nameidata nd; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(td, uap) + struct thread *td; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + /* XXX arbitrary sanity limit on `count'. */ + if (SCARG(uap, count) > 64 * 1024) + return (EINVAL); + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. 
+ * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(td, uap) + struct thread *td; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, td); + if (error) { + fdrop(fp, td); + return (error); + } + if (SCARG(uap, count) == auio.uio_resid) { + if (union_dircheckp) { + error = union_dircheckp(td, &vp, fp); + if (error == -1) + goto unionread; + if (error) { + fdrop(fp, td); + return (error); + } + } + if ((vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + } + if (SCARG(uap, basep) != NULL) { + error = copyout(&loff, SCARG(uap, basep), sizeof(long)); + } + td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(td, uap) + struct thread *td; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return 
getdirentries(td, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + * + * MP SAFE + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(td, uap) + struct thread *td; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + FILEDESC_LOCK(td->td_proc->p_fd); + fdp = td->td_proc->p_fd; + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(td, uap) + struct thread *td; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR) { + vput(vp); + return (EINVAL); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, td); + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + goto out; + } + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); + vn_finished_write(mp); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. + */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + int error; + struct file *fp; + + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } + *fpp = fp; + return (error); +} +/* + * Get (NFS) file handle + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +getfh(td, uap) + struct thread *td; + register struct getfh_args *uap; +{ + struct nameidata nd; + fhandle_t fh; + register struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout(&fh, uap->fhp, sizeof (fh)); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into + * an open descriptor. + * + * warning: do not remove the suser() call or this becomes one giant + * security hole. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +fhopen(td, uap) + struct thread *td; + struct fhopen_args /* { + syscallarg(const struct fhandle *) u_fhp; + syscallarg(int) flags; + } */ *uap; +{ + struct proc *p = td->td_proc; + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct vattr vat; + struct vattr *vap = &vat; + struct flock lf; + struct file *fp; + register struct filedesc *fdp = p->p_fd; + int fmode, mode, error, type; + struct file *nfp; + int indx; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + fmode = FFLAGS(SCARG(uap, flags)); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); + if (error) + return(error); + /* find the mount point */ + mp = vfs_getvfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); + if (error) + return (error); + /* + * from now on we have to make sure not + * to forget about the vnode + * any error that causes an abort must vput(vp) + * just set error = err and 'goto bad;'. + */ + + /* + * from vn_open + */ + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, td->td_ucred, td); + if (error) + goto bad; + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, td); /* XXX */ + if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { + vrele(vp); + return (error); + } + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, td->td_ucred, td); + vn_finished_write(mp); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, td->td_ucred, td); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { + if ((error = vfs_object_create(vp, td, td->td_ucred)) != 0) + goto bad; + } + if (fmode & FWRITE) + vp->v_writecount++; + + /* + * end of vn_open code + */ + + if ((error = falloc(td, &nfp, &indx)) != 0) { + if (fmode & FWRITE) + vp->v_writecount--; + goto bad; + } + fp = nfp; + + /* + * Hold an extra reference to avoid having fp ripped out + * from under us while we block in the lock op + */ + fhold(fp); + nfp->f_data = vp; + nfp->f_flag = fmode & FMASK; + nfp->f_ops = &vnops; + nfp->f_type = DTYPE_VNODE; + if (fmode & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (fmode & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, td); + if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + type)) != 0) { + /* + * The lock request failed. Normally close the + * descriptor but handle the case where someone might + * have dup()d or close()d it when we weren't looking. 
+ */ + FILEDESC_LOCK(fdp); + if (fdp->fd_ofiles[indx] == fp) { + fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); + /* + * release our private reference + */ + fdrop(fp, td); + return(error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) + vfs_object_create(vp, td, td->td_ucred); + + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); + +bad: + vput(vp); + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +fhstat(td, uap) + struct thread *td; + register struct fhstat_args /* { + syscallarg(struct fhandle *) u_fhp; + syscallarg(struct stat *) sb; + } */ *uap; +{ + struct stat sb; + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); + if (error) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + error = vn_stat(vp, &sb, td); + vput(vp); + if (error) + return (error); + error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +fhstatfs(td, uap) + struct thread *td; + struct fhstatfs_args /* { + syscallarg(struct fhandle) *u_fhp; + syscallarg(struct statfs) *buf; + } */ *uap; +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + struct statfs sb; + fhandle_t fh; + int error; + + /* + * Must be super user + */ + error = suser(td); + if (error) + return (error); + + if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) + return (error); + + if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) + return (error); + mp = vp->v_mount; + sp = &mp->mnt_stat; + vput(vp); + if ((error = VFS_STATFS(mp, sp, td)) != 0) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (suser(td)) { + bcopy(sp, &sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Syscall to push extended attribute configuration information into the + * VFS. Accepts a path, which it converts to a mountpoint, as well as + * a command (int cmd), and attribute name and misc data. For now, the + * attribute name is left in userspace for consumption by the VFS_op. + * It will probably be changed to be copied into sysspace by the + * syscall in the future, once issues with various consumers of the + * attribute code have raised their hands. + * + * Currently this is used only by UFS Extended Attributes. + */ +int +extattrctl(td, uap) + struct thread *td; + struct extattrctl_args /* { + syscallarg(const char *) path; + syscallarg(int) cmd; + syscallarg(const char *) filename; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + /* + * uap->attrname is not always defined. 
We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + + /* + * uap->filename is not always defined. If it is, grab a vnode lock, + * which VFS_EXTATTRCTL() will later release. + */ + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + uap->filename, td); + if ((error = namei(&nd)) != 0) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + mp = nd.ni_vp->v_mount; + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, 0); + if (error) { + if (filename_vp != NULL) + vput(filename_vp); + return (error); + } + + if (uap->attrname != NULL) { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, attrname, td); + } else { + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, + uap->attrnamespace, NULL, td); + } + + vn_finished_write(mp_writable); + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, + * filename_vp, so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +done: + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_set_file(td, uap) + struct thread *td; + struct extattr_set_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int 
+extattr_set_fd(td, uap) + struct thread *td; + struct extattr_set_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + + error = extattr_set_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + VOP_LEASE(vp, td, td->td_ucred, LEASE_READ); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the + * maximum read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_offset = 0; + if (nbytes > INT_MAX) { + error = EINVAL; + goto done; + } + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; + +done: + VOP_UNLOCK(vp, 0, td); + return (error); +} + +int +extattr_get_file(td, uap) + struct thread *td; + struct extattr_get_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, + uap->data, uap->nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +int +extattr_get_fd(td, uap) + struct thread *td; + struct extattr_get_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + syscallarg(void *) data; + syscallarg(size_t) nbytes; + } */ *uap; +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) + return (error); + + error = extattr_get_vp((struct vnode *)fp->f_data, uap->attrnamespace, + attrname, uap->data, 
uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, + td); + + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + return (error); +} + +int +extattr_delete_file(td, uap) + struct thread *td; + struct extattr_delete_file_args /* { + syscallarg(const char *) path; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); + if ((error = namei(&nd)) != 0) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); + + vrele(nd.ni_vp); + return(error); +} + +int +extattr_delete_fd(td, uap) + struct thread *td; + struct extattr_delete_fd_args /* { + syscallarg(int) fd; + syscallarg(int) attrnamespace; + syscallarg(const char *) attrname; + } */ *uap; +{ + struct file *fp; + struct vnode *vp; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + + if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) + return (error); + vp = (struct vnode *)fp->f_data; + + error = extattr_delete_vp((struct vnode *)fp->f_data, + uap->attrnamespace, attrname, td); + + fdrop(fp, td); + return (error); +} diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c new file mode 100644 index 0000000..77568c2 --- /dev/null +++ b/sys/kern/vfs_vnops.c @@ -0,0 +1,1056 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/filio.h> +#include <sys/sx.h> +#include <sys/ttycom.h> +#include <sys/conf.h> +#include <sys/syslog.h> + +#include <machine/limits.h> + +static int vn_closefile(struct file *fp, struct thread *td); +static int vn_ioctl(struct file *fp, u_long com, caddr_t data, + struct thread *td); +static int vn_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); +static int vn_poll(struct file *fp, int events, struct ucred *cred, + struct thread *td); +static int vn_kqfilter(struct file *fp, struct knote *kn); +static int vn_statfile(struct file *fp, struct stat *sb, struct thread *td); +static int vn_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags, struct thread *td); + +struct fileops vnops = { + vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter, + vn_statfile, vn_closefile +}; + +int +vn_open(ndp, flagp, cmode) + register struct nameidata *ndp; + int *flagp, cmode; +{ + struct thread *td = ndp->ni_cnd.cn_thread; + + return (vn_open_cred(ndp, flagp, cmode, td->td_ucred)); +} + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. + * + * Note that this does NOT free nameidata for the successful case, + * due to the NDINIT being done elsewhere. 
+ */ +int +vn_open_cred(ndp, flagp, cmode, cred) + register struct nameidata *ndp; + int *flagp, cmode; + struct ucred *cred; +{ + struct vnode *vp; + struct mount *mp; + struct thread *td = ndp->ni_cnd.cn_thread; + struct vattr vat; + struct vattr *vap = &vat; + int mode, fmode, error; +#ifdef LOOKUP_SHARED + int exclusive; /* The current intended lock state */ + + exclusive = 0; +#endif + +restart: + fmode = *flagp; + if (fmode & O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + bwillwrite(); + if ((error = namei(ndp)) != 0) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(ndp->ni_dvp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + vn_finished_write(mp); + if (error) { + NDFREE(ndp, NDF_ONLY_PNBUF); + return (error); + } + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); + ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; +#ifdef LOOKUP_SHARED + exclusive = 1; +#endif + } else { + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; +#ifdef LOOKUP_SHARED + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | + LOCKSHARED | LOCKLEAF; +#else + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; +#endif + if ((error = namei(ndp)) != 0) + return (error); + vp = ndp->ni_vp; + } + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + if ((fmode & O_CREAT) == 0) { + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, cred, td); + if (error) + goto bad; + } + } + if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vn_canvmio(vp) == TRUE) { +#ifdef LOOKUP_SHARED + int flock; + + if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0) + VOP_LOCK(vp, LK_UPGRADE, td); + /* + * In cases where the object is marked as dead object_create + * will unlock and relock exclusive. It is safe to call in + * here with a shared lock because we only examine fields that + * the shared lock guarantees will be stable. In the UPGRADE + * case it is not likely that anyone has used this vnode yet + * so there will be no contention. The logic after this call + * restores the requested locking state. 
+ */ +#endif + if ((error = vfs_object_create(vp, td, cred)) != 0) { + VOP_UNLOCK(vp, 0, td); + VOP_CLOSE(vp, fmode, cred, td); + NDFREE(ndp, NDF_ONLY_PNBUF); + vrele(vp); + *flagp = fmode; + return (error); + } +#ifdef LOOKUP_SHARED + flock = VOP_ISLOCKED(vp, td); + if (!exclusive && flock == LK_EXCLUSIVE) + VOP_LOCK(vp, LK_DOWNGRADE, td); +#endif + } + + if (fmode & FWRITE) + vp->v_writecount++; + *flagp = fmode; + return (0); +bad: + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(vp); + *flagp = fmode; + return (error); +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + */ +int +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if (vp->v_flag & VTEXT) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +int +vn_close(vp, flags, cred, td) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct thread *td; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, td); + /* + * XXX - In certain instances VOP_CLOSE has to do the vrele + * itself. If the vrele has been done, it will return EAGAIN + * to indicate that the vrele should not be done again. When + * this happens, we just return success. The correct thing to + * do would be to have all VOP_CLOSE instances do the vrele. + */ + if (error == EAGAIN) + return (0); + vrele(vp); + return (error); +} + +/* + * Sequential heuristic - detect sequential operation + */ +static __inline +int +sequential_heuristic(struct uio *uio, struct file *fp) +{ + + if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || + uio->uio_offset == fp->f_nextoff) { + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; + if (fp->f_seqcount >= 127) + fp->f_seqcount = 127; + return(fp->f_seqcount << 16); + } + + /* + * Not sequential, quick draw-down of seqcount + */ + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + return(0); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
+ */ +int +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct thread *td; +{ + struct uio auio; + struct iovec aiov; + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + mp = NULL; + if (rw == UIO_WRITE) { + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) + != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } else { + vn_lock(vp, LK_SHARED | LK_RETRY, td); + } + + } + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) { + if (rw == UIO_WRITE) + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. The I/O + * request is split up into smaller chunks and we try to avoid saturating + * the buffer cache while potentially holding a vnode locked, so we + * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() + * to give other processes a chance to lock the vnode (either other processes + * core'ing the same binary, or unrelated processes scanning the directory). + */ +int +vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct thread *td; +{ + int error = 0; + + do { + int chunk = (len > MAXBSIZE) ? MAXBSIZE : len; + + if (rw != UIO_READ && vp->v_type == VREG) + bwillwrite(); + error = vn_rdwr(rw, vp, base, chunk, offset, segflg, + ioflg, cred, aresid, td); + len -= chunk; /* aresid calc already includes length */ + if (error) + break; + offset += chunk; + base += chunk; + uio_yield(); + } while (len); + if (aresid) + *aresid += len; + return (error); +} + +/* + * File table vnode read routine. + */ +static int +vn_read(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct vnode *vp; + int error, ioflag; + + mtx_lock(&Giant); + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + vp = (struct vnode *)fp->f_data; + ioflag = 0; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + VOP_LEASE(vp, td, cred, LEASE_READ); + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + ioflag |= sequential_heuristic(uio, fp); + + error = VOP_READ(vp, uio, ioflag, cred); + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0, td); + mtx_unlock(&Giant); + return (error); +} + +/* + * File table vnode write routine. 
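vn_rdwr_inchunks() above trades one long-held vnode lock for many short ones: each pass transfers at most MAXBSIZE bytes, calls bwillwrite() so the buffer cache is not saturated, and yields so competing lockers can make progress. The same chunking idea, sketched in userland with pwrite(2) standing in for vn_rdwr() and an arbitrary 64 KB bound standing in for MAXBSIZE (/tmp/chunk_demo is just an example path):

	#include <sys/types.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define CHUNK	65536	/* stand-in for MAXBSIZE; any bound works */

	/* Write len bytes in bounded chunks, as vn_rdwr_inchunks() does in-kernel. */
	static int
	write_inchunks(int fd, const char *base, size_t len, off_t offset)
	{
		while (len != 0) {
			size_t chunk = len > CHUNK ? CHUNK : len;
			ssize_t n = pwrite(fd, base, chunk, offset);

			if (n == -1)
				return (-1);
			base += n;
			offset += n;
			len -= (size_t)n;
			/* The kernel calls uio_yield() here so other lockers can run. */
		}
		return (0);
	}

	int
	main(void)
	{
		static char buf[1 << 20];	/* 1 MB of zeroes */
		int fd = open("/tmp/chunk_demo", O_CREAT | O_TRUNC | O_WRONLY, 0644);

		if (fd == -1 || write_inchunks(fd, buf, sizeof(buf), 0) == -1) {
			perror("write_inchunks");
			return (1);
		}
		close(fd);
		return (0);
	}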
+ */ +static int +vn_write(fp, uio, cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *cred; + struct thread *td; + int flags; +{ + struct vnode *vp; + struct mount *mp; + int error, ioflag; + + mtx_lock(&Giant); + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + vp = (struct vnode *)fp->f_data; + if (vp->v_type == VREG) + bwillwrite(); + ioflag = IO_UNIT; + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + mp = NULL; + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + mtx_unlock(&Giant); + return (error); + } + VOP_LEASE(vp, td, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); + error = VOP_WRITE(vp, uio, ioflag, cred); + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0, td); + vn_finished_write(mp); + mtx_unlock(&Giant); + return (error); +} + +/* + * File table vnode stat routine. + */ +static int +vn_statfile(fp, sb, td) + struct file *fp; + struct stat *sb; + struct thread *td; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + int error; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = vn_stat(vp, sb, td); + VOP_UNLOCK(vp, 0, td); + + return (error); +} + +/* + * Stat a vnode; implementation for the stat syscall + */ +int +vn_stat(vp, sb, td) + struct vnode *vp; + register struct stat *sb; + struct thread *td; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, td->td_ucred, td); + if (error) + return (error); + + /* + * Zero the spare stat fields + */ + bzero(sb, sizeof *sb); + + /* + * Copy from vattr table + */ + if (vap->va_fsid != VNOVAL) + sb->st_dev = vap->va_fsid; + else + sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + /* This is a cosmetic change, symlinks do not have a mode. */ + if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) + sb->st_mode &= ~ACCESSPERMS; /* 0000 */ + else + sb->st_mode |= ACCESSPERMS; /* 0777 */ + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + if (vap->va_size > OFF_MAX) + return (EOVERFLOW); + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec = vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_createtimespec = vap->va_createtime; + + /* + * According to www.opengroup.org, the meaning of st_blksize is + * "a filesystem-specific preferred I/O block size for this + * object. In some filesystem types, this may vary from file + * to file" + * Default to PAGE_SIZE after much discussion. 
+ */ + + if (vap->va_type == VREG) { + sb->st_blksize = vap->va_blocksize; + } else if (vn_isdisk(vp, NULL)) { + sb->st_blksize = vp->v_rdev->si_bsize_best; + if (sb->st_blksize < vp->v_rdev->si_bsize_phys) + sb->st_blksize = vp->v_rdev->si_bsize_phys; + if (sb->st_blksize < BLKDEV_IOSIZE) + sb->st_blksize = BLKDEV_IOSIZE; + } else { + sb->st_blksize = PAGE_SIZE; + } + + sb->st_flags = vap->va_flags; + if (suser(td)) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else + sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(fp, com, data, td) + struct file *fp; + u_long com; + caddr_t data; + struct thread *td; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vnode *vpold; + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); + VOP_UNLOCK(vp, 0, td); + if (error) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... */ + + default: +#if 0 + return (ENOTTY); +#endif + case VFIFO: + case VCHR: + case VBLK: + if (com == FIODTYPE) { + if (vp->v_type != VCHR && vp->v_type != VBLK) + return (ENOTTY); + *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK; + return (0); + } + error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_ucred, td); + if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + sx_slock(&proctree_lock); + if (td->td_proc->p_session->s_ttyvp == vp) { + sx_sunlock(&proctree_lock); + return (0); + } + + vpold = td->td_proc->p_session->s_ttyvp; + VREF(vp); + SESS_LOCK(td->td_proc->p_session); + td->td_proc->p_session->s_ttyvp = vp; + SESS_UNLOCK(td->td_proc->p_session); + + sx_sunlock(&proctree_lock); + + /* Get rid of reference to old control tty */ + if (vpold) + vrele(vpold); + } + return (error); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(fp, events, cred, td) + struct file *fp; + int events; + struct ucred *cred; + struct thread *td; +{ + + return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td)); +} + +/* + * Check that the vnode is still valid, and if so + * acquire requested lock. + */ +int +#ifndef DEBUG_LOCKS +vn_lock(vp, flags, td) +#else +debug_vn_lock(vp, flags, td, filename, line) +#endif + struct vnode *vp; + int flags; + struct thread *td; +#ifdef DEBUG_LOCKS + const char *filename; + int line; +#endif +{ + int error; + + do { + if ((flags & LK_INTERLOCK) == 0) + mtx_lock(&vp->v_interlock); + if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) { + vp->v_flag |= VXWANT; + msleep(vp, &vp->v_interlock, PINOD | PDROP, + "vn_lock", 0); + error = ENOENT; + } else { +#if 0 + /* this can now occur in normal operation */ + if (vp->v_vxproc != NULL) + log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n"); +#endif +#ifdef DEBUG_LOCKS + vp->filename = filename; + vp->line = line; +#endif + error = VOP_LOCK(vp, + flags | LK_NOPAUSE | LK_INTERLOCK, td); + if (error == 0) + return (error); + } + flags &= ~LK_INTERLOCK; + } while (flags & LK_RETRY); + return (error); +} + +/* + * File table vnode close routine. 
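Two of the paths above are directly observable from userland: vn_stat() chooses the st_blksize reported by stat(2) (the vattr block size for regular files, the device's best block size for disks, PAGE_SIZE otherwise), and the FIONREAD case in vn_ioctl() reports va_size - f_offset for regular files and directories. A small illustration; any readable regular file will do, /etc/services is only an example.

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		const char *path = "/etc/services";	/* any regular file */
		struct stat sb;
		int fd, nread;

		fd = open(path, O_RDONLY);
		if (fd == -1 || fstat(fd, &sb) == -1)
			return (1);
		printf("st_blksize %ld st_blocks %lld\n",
		    (long)sb.st_blksize, (long long)sb.st_blocks);

		/* FIONREAD on a regular file: bytes between f_offset and EOF. */
		if (ioctl(fd, FIONREAD, &nread) == 0)
			printf("readable now: %d bytes\n", nread);
		close(fd);
		return (0);
	}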
+ */ +static int +vn_closefile(fp, td) + struct file *fp; + struct thread *td; +{ + + fp->f_ops = &badfileops; + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, td)); +} + +/* + * Preparing to start a filesystem write operation. If the operation is + * permitted, then we bump the count of operations in progress and + * proceed. If a suspend request is in progress, we wait until the + * suspension is over, and then proceed. + */ +int +vn_start_write(vp, mpp, flags) + struct vnode *vp; + struct mount **mpp; + int flags; +{ + struct mount *mp; + int error; + + /* + * If a vnode is provided, get and return the mount point that + * to which it will write. + */ + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + if ((mp = *mpp) == NULL) + return (0); + /* + * Check on status of suspension. + */ + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0); + if (error) + return (error); + } + if (flags & V_XSLEEP) + return (0); + mp->mnt_writeopcount++; + return (0); +} + +/* + * Secondary suspension. Used by operations such as vop_inactive + * routines that are needed by the higher level functions. These + * are allowed to proceed until all the higher level functions have + * completed (indicated by mnt_writeopcount dropping to zero). At that + * time, these operations are halted until the suspension is over. + */ +int +vn_write_suspend_wait(vp, mp, flags) + struct vnode *vp; + struct mount *mp; + int flags; +{ + int error; + + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + /* + * If we are not suspended or have not yet reached suspended + * mode, then let the operation proceed. + */ + if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) + return (0); + if (flags & V_NOWAIT) + return (EWOULDBLOCK); + /* + * Wait for the suspension to finish. + */ + return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH), + "suspfs", 0)); +} + +/* + * Filesystem write operation has completed. If we are suspending and this + * operation is the last one, notify the suspender that the suspension is + * now in effect. + */ +void +vn_finished_write(mp) + struct mount *mp; +{ + + if (mp == NULL) + return; + mp->mnt_writeopcount--; + if (mp->mnt_writeopcount < 0) + panic("vn_finished_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_writeopcount <= 0) + wakeup(&mp->mnt_writeopcount); +} + +/* + * Request a filesystem to suspend write operations. + */ +void +vfs_write_suspend(mp) + struct mount *mp; +{ + struct thread *td = curthread; + + if (mp->mnt_kern_flag & MNTK_SUSPEND) + return; + mp->mnt_kern_flag |= MNTK_SUSPEND; + if (mp->mnt_writeopcount > 0) + (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0); + VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td); + mp->mnt_kern_flag |= MNTK_SUSPENDED; +} + +/* + * Request a filesystem to resume write operations. + */ +void +vfs_write_resume(mp) + struct mount *mp; +{ + + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) + return; + mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED); + wakeup(&mp->mnt_writeopcount); + wakeup(&mp->mnt_flag); +} + +/* + * Implement kqueues for files by translating it to vnode operation. 
+ */ +static int +vn_kqfilter(struct file *fp, struct knote *kn) +{ + + return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn)); +} + +/* + * Simplified in-kernel wrapper calls for extended attribute access. + * Both calls pass in a NULL credential, authorizing as "kernel" access. + * Set IO_NODELOCKED in ioflg if the vnode is already locked. + */ +int +vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int *buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + int error; + + iov.iov_len = *buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = *buflen; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + + /* authorize attribute retrieval as kernel */ + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, + td); + + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0, td); + + if (error == 0) { + *buflen = *buflen - auio.uio_resid; + } + + return (error); +} + +/* + * XXX failure mode if partially written? + */ +int +vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + struct mount *mp; + int error; + + iov.iov_len = buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = buflen; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } + + /* authorize attribute setting as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + + return (error); +} + +int +vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, struct thread *td) +{ + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); + } + + /* authorize attribute removal as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0, td); + } + + return (error); +} diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src new file mode 100644 index 0000000..cdeb5e5 --- /dev/null +++ b/sys/kern/vnode_if.src @@ -0,0 +1,556 @@ +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. 
All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $FreeBSD$ +# + +# +# Above each of the vop descriptors is a specification of the locking +# protocol used by each vop call. The first column is the name of +# the variable, the remaining three columns are in, out and error +# respectively. The "in" column defines the lock state on input, +# the "out" column defines the state on succesful return, and the +# "error" column defines the locking state on error exit. +# +# The locking value can take the following values: +# L: locked; not converted to type of lock. +# A: any lock type. +# S: locked with shared lock. +# E: locked with exclusive lock for this process. +# O: locked with exclusive lock for other process. +# U: unlocked. +# -: not applicable. vnode does not yet (or no longer) exists. +# =: the same on input and output, may be either L or U. +# X: locked if not nil. +# + +# +#% islocked vp = = = +# +vop_islocked { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% lookup dvp L ? ? +#% lookup vpp - L - +# +# XXX - the lookup locking protocol defies simple description and depends +# on the flags and operation fields in the (cnp) structure. Note +# especially that *vpp may equal dvp and both may be locked. +# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% cachedlookup dvp L ? ? +#% cachedlookup vpp - L - +# +# This must be an exact copy of lookup. See kern/vfs_cache.c for details. 
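Each block in this file is a vop descriptor: the vnode_if generator turns it into an argument structure, a vnodeop_desc, and an inline VOP_*() wrapper in the generated vnode_if.h, and the wrapper packs its arguments and dispatches through the vnode's operations vector. Reconstructed from memory rather than copied from a generated header, the output for vop_access looks roughly like this:

	struct vop_access_args {
		struct vop_generic_args a_gen;
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	};
	extern struct vnodeop_desc vop_access_desc;

	static __inline int
	VOP_ACCESS(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
	{
		struct vop_access_args a;

		a.a_gen.a_desc = VDESC(vop_access);
		a.a_vp = vp;
		a.a_mode = mode;
		a.a_cred = cred;
		a.a_td = td;
		/* Dispatch through vp->v_op using the descriptor's offset. */
		return (VCALL(vp, VOFFSET(vop_access), &a));
	}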
+# +vop_cachedlookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% create dvp L L L +#% create vpp - L - +# +vop_create { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% whiteout dvp L L L +# +vop_whiteout { + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int flags; +}; + +# +#% mknod dvp L L L +#% mknod vpp - L - +# +vop_mknod { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% open vp L L L +# +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% close vp U U U +# +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% access vp L L L +# +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% getattr vp = = = +# +# XXX: This should be A A A +# +vop_getattr { + IN struct vnode *vp; + OUT struct vattr *vap; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setattr vp L L L +# +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% read vp L L L +# +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% write vp L L L +# +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% lease vp = = = +# +vop_lease { + IN struct vnode *vp; + IN struct thread *td; + IN struct ucred *cred; + IN int flag; +}; + +# +#% ioctl vp U U U +# +vop_ioctl { + IN struct vnode *vp; + IN u_long command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% poll vp U U U +# +vop_poll { + IN struct vnode *vp; + IN int events; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% kqfilter vp U U U +# +vop_kqfilter { + IN struct vnode *vp; + IN struct knote *kn; +}; + +# +#% revoke vp U U U +# +vop_revoke { + IN struct vnode *vp; + IN int flags; +}; + +# +#% fsync vp L L L +# +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct thread *td; +}; + +# +#% remove dvp L L L +#% remove vp L L L +# +vop_remove { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% link tdvp L L L +#% link vp U U U +# +vop_link { + IN struct vnode *tdvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +# +#% mkdir dvp L L L +#% mkdir vpp - L - +# +vop_mkdir { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% rmdir dvp L L L +#% rmdir vp L L L +# +vop_rmdir { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% symlink dvp L L L +#% symlink vpp - L - +# +vop_symlink { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +# +#% readdir vp L L L +# +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; + INOUT int 
*eofflag; + OUT int *ncookies; + INOUT u_long **cookies; +}; + +# +#% readlink vp L L L +# +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +# +#% inactive vp L U U +# +vop_inactive { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% reclaim vp U U U +# +vop_reclaim { + IN struct vnode *vp; + IN struct thread *td; +}; + +# +#% lock vp ? ? ? +# +vop_lock { + IN struct vnode *vp; + IN int flags; + IN struct thread *td; +}; + +# +#% unlock vp L U L +# +vop_unlock { + IN struct vnode *vp; + IN int flags; + IN struct thread *td; +}; + +# +#% bmap vp L L L +#% bmap vpp - U - +# +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t *bnp; + OUT int *runp; + OUT int *runb; +}; + +# +#% strategy vp L L L +# +vop_strategy { + IN struct vnode *vp; + IN struct buf *bp; +}; + +# +#% getwritemount vp = = = +# +vop_getwritemount { + IN struct vnode *vp; + OUT struct mount **mpp; +}; + +# +#% print vp = = = +# +vop_print { + IN struct vnode *vp; +}; + +# +#% pathconf vp L L L +# +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT register_t *retval; +}; + +# +#% advlock vp U U U +# +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +# +#% reallocblks vp L L L +# +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +# +#% getpages vp L L L +# +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +# +#% putpages vp L L L +# +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + +# +#% freeblks vp - - - +# +# This call is used by the filesystem to release blocks back to +# device-driver. This is useful if the driver has a lengthy +# erase handling or similar. +# + +vop_freeblks { + IN struct vnode *vp; + IN daddr_t addr; + IN daddr_t length; +}; + +# +#% getacl vp L L L +# +vop_getacl { + IN struct vnode *vp; + IN acl_type_t type; + OUT struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setacl vp L L L +# +vop_setacl { + IN struct vnode *vp; + IN acl_type_t type; + IN struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% aclcheck vp = = = +# +vop_aclcheck { + IN struct vnode *vp; + IN acl_type_t type; + IN struct acl *aclp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% getextattr vp L L L +# +vop_getextattr { + IN struct vnode *vp; + IN int attrnamespace; + IN const char *name; + INOUT struct uio *uio; + OUT size_t *size; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% setextattr vp L L L +# +vop_setextattr { + IN struct vnode *vp; + IN int attrnamespace; + IN const char *name; + INOUT struct uio *uio; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% createvobject vp L L L +# +vop_createvobject { + IN struct vnode *vp; + IN struct ucred *cred; + IN struct thread *td; +}; + +# +#% destroyvobject vp L L L +# +vop_destroyvobject { + IN struct vnode *vp; +}; + +# +#% getvobject vp L L L +# +vop_getvobject { + IN struct vnode *vp; + OUT struct vm_object **objpp; +}; |
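The descriptors above are consumed by filesystems, which register a handler for each operation they implement and fall back to vop_defaultop for the rest. A sketch of the usual registration table for a hypothetical filesystem "foo"; the foo_* handlers are placeholders for a real filesystem's vop implementations, and only the shape of the table is the point.

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/vnode.h>

	/* Hypothetical handlers; a real filesystem supplies its own. */
	static int foo_lookup(struct vop_lookup_args *ap);
	static int foo_read(struct vop_read_args *ap);
	static int foo_write(struct vop_write_args *ap);
	static int foo_reclaim(struct vop_reclaim_args *ap);

	vop_t **foo_vnodeop_p;
	static struct vnodeopv_entry_desc foo_vnodeop_entries[] = {
		{ &vop_default_desc,	(vop_t *) vop_defaultop },	/* fallback */
		{ &vop_lookup_desc,	(vop_t *) foo_lookup },
		{ &vop_read_desc,	(vop_t *) foo_read },
		{ &vop_write_desc,	(vop_t *) foo_write },
		{ &vop_reclaim_desc,	(vop_t *) foo_reclaim },
		{ NULL, NULL }
	};
	static struct vnodeopv_desc foo_vnodeop_opv_desc =
		{ &foo_vnodeop_p, foo_vnodeop_entries };
	VNODEOP_SET(foo_vnodeop_opv_desc);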